Skip to content

Commit

Permalink
WIP Output CSV format
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Oct 15, 2024
1 parent 8f3df96 commit 69f0116
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 168 deletions.
1 change: 1 addition & 0 deletions args/parse_treatments.bash
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ for path in data/treatments/*; do
./flora/parse_treatments.py \
--treatment-dir="$path" \
--html-file=data/output/html_output/"$output".html \
--json-dir=data/output/json_output/"$output"_json \
--csv-file=data/output/csv_output/"$output".csv
done
15 changes: 10 additions & 5 deletions flora/parse_treatments.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
from traiter.pylib.darwin_core import DarwinCore
from util.pylib import log

from flora.pylib import const
from flora.pylib.treatments import Treatments
from flora.pylib.writers.csv_writer import CsvWriter
from flora.pylib.writers.treatment_html_writer import HtmlWriter
from flora.pylib.writers.csv_writer import write_csv
from flora.pylib.writers.html_writer import HtmlWriter


def main():
Expand All @@ -20,12 +21,16 @@ def main():
treatments.parse()

if args.html_file:
writer = HtmlWriter(args.html_file, args.spotlight)
writer = HtmlWriter(
template_dir=f"{const.ROOT_DIR}/flora/pylib/writers/templates",
template="treatment_html_writer.html",
html_file=args.html_file,
spotlight=args.spotlight,
)
writer.write(treatments, args)

if args.csv_file:
writer = CsvWriter(args.csv_file)
writer.write(treatments, args)
write_csv(treatments)

if args.json_dir:
args.json_dir.mkdir(parents=True, exist_ok=True)
Expand Down
2 changes: 1 addition & 1 deletion flora/pylib/treatment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from traiter.pylib import util as t_util
from traiter.pylib.rules.base import Base

from .rules.linkable import Linkable
from flora.pylib.rules.linkable import Linkable


@dataclass
Expand Down
3 changes: 3 additions & 0 deletions flora/pylib/treatments.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ def __init__(self, treatment_dir, limit, offset):
)
self.nlp = flora_pipeline.build()

def __iter__(self):
    """Yield each item of ``self.treatments`` in its stored order."""
    # Lets callers write `for treatment in treatments:` directly on this object.
    yield from self.treatments

@staticmethod
def get_treatments(treatment_dir, limit, offset):
labels = [Treatment(p) for p in sorted(treatment_dir.glob("*"))]
Expand Down
163 changes: 91 additions & 72 deletions flora/pylib/writers/csv_writer.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,94 @@
from collections import defaultdict

import pandas as pd

from flora.pylib.rules.part import Part

from . import writer_utils as w_utils

# Every label declared on Part plus the literal "multiple_parts"; row_builder
# uses it both to skip such traits and to keep only traits that carry one of
# these names as a key.
PARTS_SET = {*Part.labels, "multiple_parts"}


class CsvWriter:
    """Write treatments to a CSV file, one row per treatment.

    Repeated traits under the same header become numbered columns of the
    form ``<header>.<n>.<field>``.
    """

    def __init__(self, csv_file, csv_min=0, first=None):
        """Store the output target and the column options.

        csv_file: object whose ``open("w")`` yields the output handle.
        csv_min: minimum number of non-null cells a column needs to be kept.
        first: column names placed before the sorted remainder; a falsy
            value selects ``["taxon"]``.
        """
        self.csv_file = csv_file
        self.csv_min = csv_min
        self.csv_rows = []
        self.first = first or ["taxon"]

    def write(self, treatments, size_units="centimeters"):
        """Format every treatment, order the columns, and write the CSV."""
        frame = self.sort_df(pd.DataFrame(self.format_all_rows(treatments)))

        with self.csv_file.open("w") as handle:
            # A one-line banner precedes the CSV header row.
            handle.write(f"** All sizes are given in {size_units}. **\n")
            frame.to_csv(handle, index=False)

    def format_all_rows(self, treatments):
        """Return a list holding one formatted row dict per treatment."""
        return list(map(self.format_row, treatments))

    def format_row(self, treatment):
        """Start a row with the treatment's taxon, then add its trait columns."""
        return self.row_builder(treatment, {"taxon": treatment.taxon})

    def row_builder(self, treatment, csv_row):
        """Add numbered trait columns to ``csv_row`` and return it.

        A trait is skipped when its ``"trait"`` value is in ``PARTS_SET`` or
        when none of its keys is in ``PARTS_SET``.
        """
        by_header = defaultdict(list)

        for trait in treatment.traits:
            if trait["trait"] in PARTS_SET or PARTS_SET.isdisjoint(trait.keys()):
                continue
            label = w_utils.html_label(trait)
            self.group_values_by_header(by_header, trait, label)

        self.number_columns(by_header, csv_row)
        return csv_row

    def sort_df(self, df):
        """Return ``df`` with ``self.first`` columns leading and the rest sorted.

        Columns outside ``self.first`` are kept only when they have at least
        ``self.csv_min`` non-null values.
        """
        extras = []
        for column in df.columns:
            if column not in self.first and df[column].notna().sum() >= self.csv_min:
                extras.append(column)

        return df[self.first + sorted(extras)]

    @staticmethod
    def group_values_by_header(by_header, trait, base_header):
        """Append the trait, minus ``COLUMN_SKIPS`` keys, under ``base_header``."""
        kept = {}
        for name, value in trait.items():
            if name not in w_utils.COLUMN_SKIPS:
                kept[name] = value
        by_header[base_header].append(kept)

    @staticmethod
    def number_columns(by_header, csv_row):
        """Copy grouped trait fields into ``csv_row`` under numbered headers.

        The n-th trait (counting from 1) under a header contributes the
        columns ``"<header>.<n>.<field>"``.
        """
        for base, traits in by_header.items():
            for position, trait in enumerate(traits, start=1):
                csv_row.update(
                    {f"{base}.{position}.{name}": value for name, value in trait.items()}
                )
from traiter.pylib.darwin_core import DarwinCore

from flora.pylib.treatments import Treatments

# Field name that add_row_fields looks up, both as the first element of the
# (key, index) row key and as the key inside that entry, to fill "taxon".
TAXON = "dwc:scientificName"


def write_csv(treatments: Treatments):
    """Build one flattened, de-duplicated row per treatment and preview it.

    Each treatment's traits are grouped, flattened, and de-duplicated, then
    the per-row taxon/treatment fields are added.  Columns that occur more
    than once in any row receive a numeric suffix.  The resulting data frame
    is only previewed on stdout for now.
    """
    rows = []
    for treatment in treatments:
        row = remove_duplicates(flatten_traits(group_traits(treatment)))
        add_row_fields(treatment, row)
        rows.append(row)

    # Suffixes depend on the maximum index seen across *all* rows, so the
    # numbering pass has to wait until every row has been collected.
    numbered = number_columns(rows, get_max_indexes(rows))

    frame = pd.DataFrame(numbered)
    print(frame.head())
    # TODO: sort columns
    # TODO: output data frame


def number_columns(rows, max_indexes):
    """Convert ``(key, index)``-keyed rows into flat column -> value rows.

    rows: list of dicts mapping ``(key, index)`` to a dict of column values.
    max_indexes: mapping of key to the highest index seen for that key.

    Columns whose key has a maximum index above 1 get an ``_<index>`` suffix;
    all other columns keep their bare name.  Returns a new list of dicts.
    """
    numbered = []
    for row in rows:
        renamed = {}
        for (key, index), fields in row.items():
            tag = f"_{index}" if max_indexes[key] > 1 else ""
            renamed.update({name + tag: value for name, value in fields.items()})
        numbered.append(renamed)
    return numbered


def get_max_indexes(rows):
    """Return, for every key, the largest index used with it in any row.

    rows: iterable of dicts keyed by ``(key, index)`` tuples.

    The result is a ``defaultdict(int)``, so keys never seen read as 0.
    """
    highest = defaultdict(int)
    for row in rows:
        for key, index in row:
            highest[key] = max(highest[key], index)
    return highest


def remove_duplicates(flattened):
    """Drop repeated value dicts per key and index the survivors from 1.

    flattened: mapping of key to a list of value dicts.

    Returns a dict keyed by ``(key, n)`` where ``n`` counts the distinct value
    dicts for that key in first-seen order.  Two dicts count as the same only
    when their items match in the same order.
    """
    unique = {}
    for key, values in flattened.items():
        seen = set()
        for value in values:
            signature = tuple(value.items())
            if signature in seen:
                continue
            seen.add(signature)
            # After the add, len(seen) is this value's 1-based position.
            unique[(key, len(seen))] = value
    return unique


def _expand_fields(flat):
    """Return a copy of ``flat`` with dict values spread into ``key_field`` entries."""
    expanded = {}
    for key, value in flat.items():
        if isinstance(value, dict):
            expanded.update({f"{key}_{name}": item for name, item in value.items()})
        else:
            expanded[key] = value
    return expanded


def flatten_traits(grouped):
    """Flatten every grouped value into a single-level dict.

    grouped: mapping of trait name to a list of objects exposing ``flatten()``.

    Each object's ``flatten()`` result is copied; any value that is itself a
    dict is expanded one level into ``"<key>_<field>"`` entries.  Returns a
    ``defaultdict(list)`` of name -> list of flat dicts.  Names whose list is
    empty do not appear in the result.
    """
    flattened = defaultdict(list)
    for name, dwc_list in grouped.items():
        for dwc_value in dwc_list:
            flattened[name].append(_expand_fields(dwc_value.flatten()))
    return flattened


def group_traits(treatment):
    """Group a treatment's traits by ``trait.key``.

    Every trait is converted with ``trait.to_dwc`` using its own fresh
    ``DarwinCore`` object.  Returns a ``defaultdict(list)`` mapping each key
    to the converted values in trait order.
    """
    by_key: dict[str, list[DarwinCore]] = defaultdict(list)
    for trait in treatment.traits:
        converted = trait.to_dwc(DarwinCore())
        by_key[trait.key].append(converted)
    return by_key


def add_row_fields(treatment, formatted: dict[tuple, dict]):
    """Add the per-row ``taxon`` and ``treatment`` entries to ``formatted`` in place.

    The taxon is read from the ``(TAXON, 1)`` entry when that entry exists and
    is non-empty; otherwise it is ``"unknown"``.  The treatment value is the
    stem of ``treatment.path``.
    """
    scientific = formatted.get((TAXON, 1))
    if scientific:
        name = scientific[TAXON]
    else:
        name = "unknown"

    formatted[("taxon", 1)] = {"taxon": name}
    formatted[("treatment", 1)] = {"treatment": treatment.path.stem}
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
from typing import Any, NamedTuple

import jinja2
from tqdm import tqdm
from traiter.pylib.darwin_core import DYN, DarwinCore

from flora.pylib.label import Label
from flora.pylib.labels import Labels
from flora.pylib.treatments import Treatments

COLOR_COUNT = 14
BACKGROUNDS = itertools.cycle([f"cc{i}" for i in range(COLOR_COUNT)])
Expand All @@ -27,9 +28,10 @@ class Sortable(NamedTuple):


@dataclass(kw_only=True)
class BaseHtmlWriterRow:
class HtmlWriterRow:
formatted_text: str
formatted_traits: list[TraitRow] = field(default_factory=list)
treatment_id: str = ""


class CssClasses:
Expand All @@ -45,16 +47,29 @@ def __getitem__(self, key):
return self.classes[key]


class BaseHtmlWriter:
class HtmlWriter:
def __init__(self, template_dir, template, html_file, spotlight=""):
    """Record the template location, the output file, and the CSS state.

    template_dir: directory holding the template.
    template: name of the template within ``template_dir``.
    html_file: destination of the rendered HTML.
    spotlight: value handed to ``CssClasses``; defaults to an empty string.
    """
    # Rows accumulate here as they are formatted.
    self.formatted = []
    self.css_classes = CssClasses(spotlight)

    self.html_file = html_file
    self.template = template
    self.template_dir = template_dir

def write(self, rows: Labels, args=None):
raise NotImplementedError
def write(self, treatments: Treatments, args=None):
for treat in tqdm(treatments.treatments, desc="write"):
self.formatted.append(
HtmlWriterRow(
treatment_id=treat.path.stem,
formatted_text=self.format_text(treat, exclude=["trs"]),
formatted_traits=self.format_traits(treat),
),
)

summary = {
"Total treatments:": len(treatments.treatments),
}

self.write_template(args.html_file, summary=summary)

def format_text(self, row: Label, exclude=None):
"""Wrap traits in the text with <spans> that can be formatted with CSS."""
Expand Down
38 changes: 0 additions & 38 deletions flora/pylib/writers/treatment_html_writer.py

This file was deleted.

47 changes: 0 additions & 47 deletions flora/pylib/writers/writer_utils.py

This file was deleted.

0 comments on commit 69f0116

Please sign in to comment.