From 69f0116f5c14358532cd1774da2af33e91166265 Mon Sep 17 00:00:00 2001 From: rafelafrance Date: Tue, 15 Oct 2024 17:55:30 -0400 Subject: [PATCH] WIP Output CSV format --- args/parse_treatments.bash | 1 + flora/parse_treatments.py | 15 +- flora/pylib/treatment.py | 2 +- flora/pylib/treatments.py | 3 + flora/pylib/writers/csv_writer.py | 163 ++++++++++-------- .../{base_html_writer.py => html_writer.py} | 25 ++- flora/pylib/writers/treatment_html_writer.py | 38 ---- flora/pylib/writers/writer_utils.py | 47 ----- 8 files changed, 126 insertions(+), 168 deletions(-) rename flora/pylib/writers/{base_html_writer.py => html_writer.py} (85%) delete mode 100644 flora/pylib/writers/treatment_html_writer.py delete mode 100644 flora/pylib/writers/writer_utils.py diff --git a/args/parse_treatments.bash b/args/parse_treatments.bash index 0ded6a94..352aeb47 100755 --- a/args/parse_treatments.bash +++ b/args/parse_treatments.bash @@ -6,5 +6,6 @@ for path in data/treatments/*; do ./flora/parse_treatments.py \ --treatment-dir="$path" \ --html-file=data/output/html_output/"$output".html \ + --json-dir=data/output/json_output/"$output"_json \ --csv-file=data/output/csv_output/"$output".csv done diff --git a/flora/parse_treatments.py b/flora/parse_treatments.py index 976c01e4..4ffbada2 100755 --- a/flora/parse_treatments.py +++ b/flora/parse_treatments.py @@ -7,9 +7,10 @@ from traiter.pylib.darwin_core import DarwinCore from util.pylib import log +from flora.pylib import const from flora.pylib.treatments import Treatments -from flora.pylib.writers.csv_writer import CsvWriter -from flora.pylib.writers.treatment_html_writer import HtmlWriter +from flora.pylib.writers.csv_writer import write_csv +from flora.pylib.writers.html_writer import HtmlWriter def main(): @@ -20,12 +21,16 @@ def main(): treatments.parse() if args.html_file: - writer = HtmlWriter(args.html_file, args.spotlight) + writer = HtmlWriter( + template_dir=f"{const.ROOT_DIR}/flora/pylib/writers/templates", + template="treatment_html_writer.html", + html_file=args.html_file, + spotlight=args.spotlight, + ) writer.write(treatments, args) if args.csv_file: - writer = CsvWriter(args.csv_file) - writer.write(treatments, args) + write_csv(treatments) if args.json_dir: args.json_dir.mkdir(parents=True, exist_ok=True) diff --git a/flora/pylib/treatment.py b/flora/pylib/treatment.py index f9286d90..005a7a65 100644 --- a/flora/pylib/treatment.py +++ b/flora/pylib/treatment.py @@ -6,7 +6,7 @@ from traiter.pylib import util as t_util from traiter.pylib.rules.base import Base -from .rules.linkable import Linkable +from flora.pylib.rules.linkable import Linkable @dataclass diff --git a/flora/pylib/treatments.py b/flora/pylib/treatments.py index 1f2e8126..3d2a923c 100644 --- a/flora/pylib/treatments.py +++ b/flora/pylib/treatments.py @@ -12,6 +12,9 @@ def __init__(self, treatment_dir, limit, offset): ) self.nlp = flora_pipeline.build() + def __iter__(self): + yield from self.treatments + @staticmethod def get_treatments(treatment_dir, limit, offset): labels = [Treatment(p) for p in sorted(treatment_dir.glob("*"))] diff --git a/flora/pylib/writers/csv_writer.py b/flora/pylib/writers/csv_writer.py index ff8db278..1c63f16f 100644 --- a/flora/pylib/writers/csv_writer.py +++ b/flora/pylib/writers/csv_writer.py @@ -1,75 +1,94 @@ from collections import defaultdict import pandas as pd - -from flora.pylib.rules.part import Part - -from . import writer_utils as w_utils - -PARTS_SET = {*Part.labels, "multiple_parts"} - - -class CsvWriter: - def __init__(self, csv_file, csv_min=0, first=None): - self.csv_file = csv_file - self.csv_min = csv_min - self.csv_rows = [] - self.first = first if first else ["taxon"] - - def write(self, treatments, size_units="centimeters"): - csv_rows = self.format_all_rows(treatments) - df = pd.DataFrame(csv_rows) - df = self.sort_df(df) - - with self.csv_file.open("w") as out_file: - out_file.write(f"** All sizes are given in {size_units}. **\n") - df.to_csv(out_file, index=False) - - def format_all_rows(self, treatments): - csv_rows = [self.format_row(r) for r in treatments] - return csv_rows - - def format_row(self, treatment): - csv_row = {"taxon": treatment.taxon} - return self.row_builder(treatment, csv_row) - - def row_builder(self, treatment, csv_row): - by_header = defaultdict(list) - for trait in treatment.traits: - if trait["trait"] in PARTS_SET: - continue - - key_set = set(trait.keys()) - - if not (PARTS_SET & key_set): - continue - - base_header = w_utils.html_label(trait) - - self.group_values_by_header(by_header, trait, base_header) - self.number_columns(by_header, csv_row) - return csv_row - - def sort_df(self, df): - rest = [ - c - for c in df.columns - if c not in self.first and df[c].notna().sum() >= self.csv_min - ] - - columns = self.first + sorted(rest) - df = df[columns] - return df - - @staticmethod - def group_values_by_header(by_header, trait, base_header): - filtered = {k: v for k, v in trait.items() if k not in w_utils.COLUMN_SKIPS} - by_header[base_header].append(filtered) - - @staticmethod - def number_columns(by_header, csv_row): - for unnumbered_header, trait_list in by_header.items(): - for i, trait in enumerate(trait_list, 1): - for key, value in trait.items(): - header = f"{unnumbered_header}.{i}.{key}" - csv_row[header] = value +from traiter.pylib.darwin_core import DarwinCore + +from flora.pylib.treatments import Treatments + +TAXON = "dwc:scientificName" + + +def write_csv(treatments: Treatments): + rows = [] + for treatment in treatments: + grouped = group_traits(treatment) + flattened = flatten_traits(grouped) + formatted = remove_duplicates(flattened) + add_row_fields(treatment, formatted) + rows.append(formatted) + + max_indexes = get_max_indexes(rows) + rows = number_columns(rows, max_indexes) + + df = pd.DataFrame(rows) + print(df.head()) + # sort columns + # output data frame + + +def number_columns(rows, max_indexes): + new_rows = [] + for row in rows: + new_row = {} + for (key, i), value in row.items(): + suffix = f"_{i}" if max_indexes[key] > 1 else "" + for col, val in value.items(): + new_row[col + suffix] = val + + new_rows.append(new_row) + return new_rows + + +def get_max_indexes(rows): + max_index = defaultdict(int) + for row in rows: + for key, i in row: + if i > max_index[key]: + max_index[key] = i + return max_index + + +def remove_duplicates(flattened): + cleaned = {} + for key, values in flattened.items(): + i = 0 + used = set() + for val in values: + as_tuple = tuple(val.items()) + if as_tuple not in used: + i += 1 + used.add(as_tuple) + cleaned[(key, i)] = val + return cleaned + + +def flatten_traits(grouped): + flattened = defaultdict(list) + for name, dwc_list in grouped.items(): + for dwc_value in dwc_list: + new = {} + flat = dwc_value.flatten() + for key, value in flat.items(): + if isinstance(value, dict): + for field, val in value.items(): + new[f"{key}_{field}"] = val + else: + new[key] = value + flattened[name].append(new) + return flattened + + +def group_traits(treatment): + grouped: dict[str, list[DarwinCore]] = defaultdict(list) + for trait in treatment.traits: + dwc = DarwinCore() + dwc_trait = trait.to_dwc(dwc) + grouped[trait.key].append(dwc_trait) + return grouped + + +def add_row_fields(treatment, formatted: dict[tuple, dict]): + taxon = formatted.get((TAXON, 1)) + taxon = taxon[TAXON] if taxon else "unknown" + formatted[("taxon", 1)] = {"taxon": taxon} + formatted[("treatment", 1)] = {"treatment": treatment.path.stem} diff --git a/flora/pylib/writers/base_html_writer.py b/flora/pylib/writers/html_writer.py similarity index 85% rename from flora/pylib/writers/base_html_writer.py rename to flora/pylib/writers/html_writer.py index 22a2a46f..1d245064 100644 --- a/flora/pylib/writers/base_html_writer.py +++ b/flora/pylib/writers/html_writer.py @@ -5,10 +5,11 @@ from typing import Any, NamedTuple import jinja2 +from tqdm import tqdm from traiter.pylib.darwin_core import DYN, DarwinCore from flora.pylib.label import Label -from flora.pylib.labels import Labels +from flora.pylib.treatments import Treatments COLOR_COUNT = 14 BACKGROUNDS = itertools.cycle([f"cc{i}" for i in range(COLOR_COUNT)]) @@ -27,9 +28,10 @@ class Sortable(NamedTuple): @dataclass(kw_only=True) -class BaseHtmlWriterRow: +class HtmlWriterRow: formatted_text: str formatted_traits: list[TraitRow] = field(default_factory=list) + treatment_id: str = "" class CssClasses: @@ -45,7 +47,7 @@ def __getitem__(self, key): return self.classes[key] -class BaseHtmlWriter: +class HtmlWriter: def __init__(self, template_dir, template, html_file, spotlight=""): self.template_dir = template_dir self.template = template @@ -53,8 +55,21 @@ def __init__(self, template_dir, template, html_file, spotlight=""): self.css_classes = CssClasses(spotlight) self.formatted = [] - def write(self, rows: Labels, args=None): - raise NotImplementedError + def write(self, treatments: Treatments, args=None): + for treat in tqdm(treatments.treatments, desc="write"): + self.formatted.append( + HtmlWriterRow( + treatment_id=treat.path.stem, + formatted_text=self.format_text(treat, exclude=["trs"]), + formatted_traits=self.format_traits(treat), + ), + ) + + summary = { + "Total treatments:": len(treatments.treatments), + } + + self.write_template(args.html_file, summary=summary) def format_text(self, row: Label, exclude=None): """Wrap traits in the text with that can be formatted with CSS.""" diff --git a/flora/pylib/writers/treatment_html_writer.py b/flora/pylib/writers/treatment_html_writer.py deleted file mode 100644 index fe2a1657..00000000 --- a/flora/pylib/writers/treatment_html_writer.py +++ /dev/null @@ -1,38 +0,0 @@ -from dataclasses import dataclass - -from tqdm import tqdm - -from flora.pylib import const -from flora.pylib.treatments import Treatments -from flora.pylib.writers.base_html_writer import BaseHtmlWriter, BaseHtmlWriterRow - - -@dataclass(kw_only=True) -class HtmlWriterRow(BaseHtmlWriterRow): - treatment_id: str = "" - - -class HtmlWriter(BaseHtmlWriter): - def __init__(self, html_file, spotlight=""): - super().__init__( - template_dir=f"{const.ROOT_DIR}/flora/pylib/writers/templates", - template="treatment_html_writer.html", - html_file=html_file, - spotlight=spotlight, - ) - - def write(self, treatments: Treatments, args=None): - for treat in tqdm(treatments.treatments, desc="write"): - self.formatted.append( - HtmlWriterRow( - treatment_id=treat.path.stem, - formatted_text=self.format_text(treat, exclude=["trs"]), - formatted_traits=self.format_traits(treat), - ), - ) - - summary = { - "Total treatments:": len(treatments.treatments), - } - - self.write_template(args.text_dir, summary=summary) diff --git a/flora/pylib/writers/writer_utils.py b/flora/pylib/writers/writer_utils.py deleted file mode 100644 index 15a4433a..00000000 --- a/flora/pylib/writers/writer_utils.py +++ /dev/null @@ -1,47 +0,0 @@ -from pathlib import Path - -from traiter.pylib import term_util - -from flora.pylib.rules import terms - -LOCATION_CSV = Path(terms.__file__).parent / "part_location_terms.csv" -LOCATION_ENTS = term_util.get_labels(LOCATION_CSV) - -TITLE_SKIPS = ["start", "end"] -FIELD_SKIPS = [*TITLE_SKIPS, "trait", "dimensions"] -FIELD_SKIPS += ["part", "subpart"] -COLUMN_SKIPS = [*FIELD_SKIPS, "taxon"] -TRAIT_SKIPS = [*LOCATION_ENTS, "part", "subpart", "sex"] - -SUBPART_SET = {"subpart"} - - -def label_parts(trait): - # keys = set(trait.keys()) - - name = {} # Dicts preserve order sets do not - - part = trait.get("part", "") - name[" ".join(part) if isinstance(part, list) else part] = 1 - - subpart = trait.get("subpart", "") - if subpart: - name[trait[subpart[0]]] = 1 - - name[trait["trait"]] = 1 - - if trait.get("sex"): - name[trait["sex"]] = 1 - - return name - - -def html_label(trait): - parts = label_parts(trait) - - parts = "_".join(parts.keys()) - parts = parts.strip().replace(" ", "_").replace("-", "") - parts = parts.removeprefix("_") - parts = parts.removesuffix("_part") - - return parts