diff --git a/README.md b/README.md index 5e4a7de7..187032a3 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,12 @@ This repository merges three older repositories: - `traiter_plants` - `traiter_efloras` - `traiter_mimosa` -- Parts of `digi_leap` -More merging for other Traiter repositories for plant traits may occur. +Some functionality was also split out to enable me to use it in other projects. +- `pdf_parsers`: Scripts for parsing PDFs to prepare them for information extraction. + - https://github.com/rafelafrance/pdf_parsers +- `LabelTraiter`: Parsing treatments (this repo) and herbarium labels are now separate repositories. + - https://github.com/rafelafrance/LabelTraiter ## All right, what's this all about then? **Challenge**: Extract trait information from plant treatments. That is, if I'm given treatment text like: (Reformatted to emphasize targeted traits.) @@ -28,9 +31,6 @@ Essentially, we are finding relevant terms in the text (NER) and then linking th 4. Sex: Plants exhibit sexual dimorphism, so we to note which part/subpart/trait notation is associated with which sex. 5. Other text: Things like conjunctions, punctuation, etc. Although they are not recorded, they are often important for parsing and linking of terms. -## Multiple methods for parsing -1. Rule based parsing. Most machine learning models require a substantial training dataset. I use this method to bootstrap the training data. If machine learning methods fail, I can fall back to this. - ## Rule-based parsing strategy 1. I label terms using Spacy's phrase and rule-based matchers. 2. Then I match terms using rule-based matchers repeatedly until I have built up a recognizable trait like: color, size, count, etc. @@ -58,6 +58,38 @@ cd FloraTraiter make install ``` +### Extract traits + +You'll need some treatment text files. One treatment per file. 
+ +Example: + +```bash +parse-treatments --treatment-dir /path/to/treatments --json-dir /path/to/output/traits --html-file /path/to/traits.html +``` + +The output formats --json-dir & --html-file are optional. An example of the HTML output was shown above. Here is an example of the JSON output: + +```json +{ + "dwc:scientificName": "Astragalus cobrensis A. Gray var. maguirei Kearney, | var. maguirei", + "dwc:taxonRank": "variety", + "dwc:scientificNameAuthorship": "A. Gray | Kearney", + "dwc:dynamicProperties": { + "leafletHairSurface": "pilosulous", + "leafletHair": "hair", + "leafletHairShape": "incurved-ascending", + "leafletHairSize": "lengthLowInCentimeters: 0.06 ~ lengthHighInCentimeters: 0.08", + "leafPart": "leaflet | leaf", + "partLocation": "adaxial", + "fruitPart": "legume", + "legumeColor": "white", + "legumeSurface": "villosulous" + }, + "text": "..." +} +``` + ### Taxon database A taxon database is included with the source code, but it may be out of date. I build a taxon database from 4 sources. The 3 primary sources each have various issues, but they complement each other well. @@ -69,8 +101,6 @@ A taxon database is included with the source code, but it may be out of date. I Download the first 3 sources and then use the `util_add_taxa.py` script to extract the taxa and put them into a form the parsers can use. 
-## Repository details - ## Tests There are tests which you can run like so: ```bash diff --git a/assets/traits.png b/assets/traits.png index 16f53571..6b786f46 100644 Binary files a/assets/traits.png and b/assets/traits.png differ diff --git a/assets/treatment.png b/assets/treatment.png index aec14d6e..44baf3fe 100644 Binary files a/assets/treatment.png and b/assets/treatment.png differ diff --git a/flora/parse_treatments.py b/flora/parse_treatments.py index f691904a..ddb65b19 100755 --- a/flora/parse_treatments.py +++ b/flora/parse_treatments.py @@ -15,26 +15,26 @@ def main(): log.started() args = parse_args() - treatments: Treatments = Treatments(args) + treatments: Treatments = Treatments(args.treatment_dir, args.limit, args.offset) treatments.parse() if args.html_file: writer = HtmlWriter(args.html_file, args.spotlight) writer.write(treatments, args) - if args.traiter_dir: - args.traiter_dir.mkdir(parents=True, exist_ok=True) - write_json(treatments, args.traiter_dir) + if args.json_dir: + args.json_dir.mkdir(parents=True, exist_ok=True) + write_json(treatments, args.json_dir) log.finished() -def write_json(treatments, traiter_dir): +def write_json(treatments, json_dir): for treat in treatments.treatments: dwc = DarwinCore() _ = [t.to_dwc(dwc) for t in treat.traits] - path = traiter_dir / f"{treat.path.stem}.json" + path = json_dir / f"{treat.path.stem}.json" with path.open("w") as f: output = dwc.to_dict() output["text"] = treat.text @@ -54,15 +54,15 @@ def parse_args() -> argparse.Namespace: ) arg_parser.add_argument( - "--text-dir", + "--treatment-dir", metavar="PATH", type=Path, required=True, - help="""Directory containing the input text files.""", + help="""Directory containing the input treatment text files.""", ) arg_parser.add_argument( - "--traiter-dir", + "--json-dir", metavar="PATH", type=Path, help="""Output JSON files holding traits, one for each input text file, in this diff --git a/flora/pylib/treatments.py b/flora/pylib/treatments.py index 
7cd1bf81..14247d99 100644 --- a/flora/pylib/treatments.py +++ b/flora/pylib/treatments.py @@ -6,16 +6,18 @@ class Treatments: - def __init__(self, args): - self.treatments: list[Treatment] = self.get_treatments(args) + def __init__(self, treatment_dir, limit, offset): + self.treatments: list[Treatment] = self.get_treatments( + treatment_dir, limit, offset + ) self.nlp = pipeline.build() @staticmethod - def get_treatments(args): - labels = [Treatment(p) for p in sorted(args.text_dir.glob("*"))] + def get_treatments(treatment_dir, limit, offset): + labels = [Treatment(p) for p in sorted(treatment_dir.glob("*"))] - if args.limit: - labels = labels[args.offset : args.limit + args.offset] + if limit: + labels = labels[offset : limit + offset] return labels