CsvGeneralizer: Adding ability to interpret metadata rows (#106)
- adding docs to parameters
- expose more parameters in CLI
- additional tests
cmungall authored Nov 10, 2022
1 parent 455c91a commit b96e566
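
The headline feature can be exercised from the CLI via the --data-dictionary-row-count option added in cli.py below; an illustrative invocation, extending the generalize-tsv example from that file's docstring (file path and names are from that example):

    schemauto generalize-tsv --data-dictionary-row-count 1 --class-name Person --schema-name PersonInfo my/data/persons.tsv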
Showing 7 changed files with 632 additions and 2,139 deletions.
2,492 changes: 384 additions & 2,108 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -24,7 +24,7 @@ psycopg2-binary = "^2.9.2"
 strsimpy = "^0.2.1"
 requests = "^2.26.0"
 bioregistry = "^0.5.87"
-oaklib = "^0.1.43"
+oaklib = "^0.1.52"
 pandera = "^0.12.0"
 tomlkit = "^0.11.4"
 inflect = "^6.0.0"
15 changes: 13 additions & 2 deletions schema_automator/annotators/schema_annotator.py
@@ -55,6 +55,7 @@ def annotate_element(self, elt: Union[PermissibleValue, Element]) -> None:
         if self.mine_descriptions and elt.description:
             texts.append(elt.description)
         for text in texts:
+            logging.info(f"Annotating: {text}")
             for r in self.annotate_text(text):
                 logging.debug(f'MATCH: {r}')
                 if self.allow_partial or r.matches_whole_text:
@@ -91,13 +92,15 @@ def annotate_text(self, text: str) -> Iterator[TextAnnotation]:
         oi = self.ontology_implementation
         text_exp = uncamel(text)  # TODO: use main linkml_runtime method
         if isinstance(oi, TextAnnotatorInterface):
+            logging.debug(f"Using TextAnnotatorInterface on {text_exp}")
             # TextAnnotation is available; use this by default
             for r in oi.annotate_text(text_exp):
                 yield r
             if text_exp != text.lower():
                 for r in oi.annotate_text(text_exp):
                     yield r
         elif isinstance(oi, SearchInterface):
+            logging.debug(f"Using SearchInterface on {text_exp}")
             # use search as an alternative
             cfg = SearchConfiguration(is_complete=True)
             for r in oi.basic_search(text, config=cfg):
@@ -128,7 +131,15 @@ def enrich(self, schema: Union[SchemaDefinition, str]) -> SchemaDefinition:
         Enrich a schema by performing lookups on the external ontology/vocabulary endpoint,
         and copying over metadata

-        Currently the only metadata obtained is text definitions
+        Currently, the only metadata obtained is text definitions.
+
+        .. code-block:: python
+
+            >>> from schema_automator.annotators.schema_annotator import SchemaAnnotator
+            >>> from oaklib.selector import get_implementation_from_shorthand
+            >>> oi = get_implementation_from_shorthand("sqlite:obo:so")
+            >>> sa = SchemaAnnotator(ontology_implementation=oi)
+            >>> schema = sa.enrich("tests/data/schema.yaml")

         :param schema:
         :return:
@@ -160,7 +171,7 @@ def _add_description_from_curies(self, elt: Union[Element, PermissibleValue], cu
             if elt.description:
                 break
             try:
-                defn = oi.get_definition_by_curie(x)
+                defn = oi.definition(x)
                 if defn:
                     elt.description = defn
                 else:
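The get_definition_by_curie -> definition rename above tracks the oaklib bump to ^0.1.52 in pyproject.toml; a minimal sketch of the renamed call (the selector comes from the docstring example above; the CURIE is illustrative):

    from oaklib.selector import get_implementation_from_shorthand

    oi = get_implementation_from_shorthand("sqlite:obo:so")
    # returns the text definition for the CURIE, or None if there is none
    print(oi.definition("SO:0000704"))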
30 changes: 27 additions & 3 deletions schema_automator/cli.py
@@ -87,6 +87,13 @@ def main(verbose: int, quiet: bool):
 @click.option('--column-separator', '-s', default='\t', help='separator')
 @click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
 @click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
+@click.option('--enum-threshold', type=click.FLOAT, help='set high to be more inclusive')
+@click.option('--max-enum-size',
+              type=click.INT,
+              help='set high to be more inclusive')
+@click.option('--data-dictionary-row-count',
+              type=click.INT,
+              help='number of rows after the header that provide metadata about columns')
 @click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
 @click.option('--pandera/--no-pandera', default=False, help='set to use pandera as the inference engine')
 def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, annotator, **kwargs):
@@ -99,6 +106,7 @@ def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, anno
         schemauto generalize-tsv --class-name Person --schema-name PersonInfo my/data/persons.tsv
     """
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
     if pandera:
         ie = PandasDataGeneralizer(**kwargs)
     else:
@@ -387,16 +395,32 @@ def annotate_schema(schema: str, input: str, output: str, **kwargs):
 @main.command()
 @click.argument('schema')
 @click.option('--input', '-i', help="OAK input ontology selector")
+@click.option('--annotate/--no-annotate', default=True, help="If true, annotate the schema")
 @output_option
-def enrich_schema(schema: str, input: str, output: str, **args):
+def enrich_schema(schema: str, input: str, output: str, annotate: bool, **args):
     """
-    Annotate all elements of a schema
+    Enrich a schema using an ontology.
+
+    This will use OAK to add additional metadata using uris and mappings in the schema.
+    For example, if your schema has a class with a mapping to a SO class,
+    then the definition of that class will be copied to the class description.
+
+    Example (requires a BioPortal API key):
+
+        schemauto enrich-schema -i bioportal: my-schema.yaml -o my-enriched.yaml
+
+    If your schema has no mappings you can use --annotate to add them.
+
+    Example:
+
+        schemauto enrich-schema -i so.obo --annotate my-schema.yaml -o my-enriched.yaml
     """
     impl = get_implementation_from_shorthand(input)
     annr = SchemaAnnotator(impl)
+    logging.info(f"Enriching: {schema}")
+    if annotate:
+        schema = annr.annotate_schema(schema)
     schema = annr.enrich(schema)
     write_schema(schema, output)

156 changes: 133 additions & 23 deletions schema_automator/generalizers/csv_data_generalizer.py
@@ -1,7 +1,7 @@
 import click
 import logging
 import yaml
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set, Any
 from collections import defaultdict
 import os
 import re
@@ -14,6 +14,7 @@
 from deprecation import deprecated
 from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, TypeDefinition, SlotDefinition
+from linkml_runtime.linkml_model.meta import UniqueKey
 from quantulum3 import parser as q_parser
 from dataclasses import dataclass, field

@@ -63,17 +64,46 @@ class CsvDataGeneralizer(Generalizer):
     """

     column_separator: str = "\t"
+    """character that separates columns in the input file"""

     schema_name: str = 'example'
+    """LinkML schema name (no spaces)"""

     robot: bool = False
+    """If true, conforms to ROBOT template format. Data dictionary rows start with '>'"""
+
+    data_dictionary_row_count: int = field(default=0)
+    """number of rows after the header that provide data dictionary information about the columns"""

     enum_columns: List[str] = field(default_factory=lambda: [])
+    """List of columns that are coerced into enums"""

     enum_mask_columns: List[str] = field(default_factory=lambda: [])
+    """List of columns that are excluded from being enums"""

     enum_threshold: float = 0.1
+    """If the number of distinct values divided by the total number of values is less than this, the column is considered an enum"""

     enum_strlen_threshold: int = 30
+    """Maximum length of a string to be considered a permissible enum value"""

     max_enum_size: int = 50
+    """Max number of permissible values for a column to be considered an enum"""

     downcase_header: bool = False
+    """If true, coerce column names to be lower case"""

     infer_foreign_keys: bool = False
+    """For multi-CSV files, infer linkages between rows"""

-    max_pk_len: int = 60  # URIs can be long..
+    max_pk_len: int = 60
+    """Maximum length to be considered for a primary key column. Note: URIs can be long"""

     min_distinct_fk_val: int = 8
+    """For inferring foreign keys, there must be at least this many distinct values"""

+    source_schema: Optional[SchemaDefinition] = None
+    """Optional base schema to draw from"""
     def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
         """
@@ -297,14 +327,31 @@ def convert_dicts(self,
                       rr: List[Dict],
                       schema_name: str = 'example',
                       class_name: str = DEFAULT_CLASS_NAME,
-                      **kwargs) -> SchemaDefinition:
+                      **kwargs) -> Optional[SchemaDefinition]:
+        """
+        Converts a list of row objects to a schema.
+
+        Each row is a data item, presumed to be of the same type,
+        that is generalized.
+
+        :param rr:
+        :param schema_name:
+        :param class_name:
+        :param kwargs:
+        :return:
+        """
         slots = {}
-        slot_values = {}
+
+        slot_distinct_values: Dict[str, Set[Any]] = {}
+        """distinct values for each slot"""
+
+        slot_values: Dict[str, List[Any]] = defaultdict(list)
+        """all values for each slot"""
+
         n = 0
         enums = {}
         robot_defs = {}
         slot_usage = {}
         types = {}
         enum_columns = self.enum_columns
         enum_mask_columns = self.enum_mask_columns
         if len(rr) == 0:
@@ -317,6 +364,14 @@
                 for k, v in row.items():
                     robot_defs[k] = v
                 continue
+            if n <= self.data_dictionary_row_count:
+                if self.source_schema is None:
+                    self.source_schema = SchemaDefinition(id="auto", name="auto")
+                for k, v in row.items():
+                    if k not in self.source_schema.slots:
+                        self.source_schema.slots[k] = SlotDefinition(k)
+                    self.source_schema.slots[k].description = v
+                continue
             for k, v in row.items():
                 if k is None or k == '':
                     continue
@@ -332,22 +387,44 @@ def convert_dicts(self,
                     vs = [v]
                 if k not in slots:
                     slots[k] = {'range': None}
-                    slot_values[k] = set()
+                    slot_distinct_values[k] = set()
                 if v is not None and v != "" and not str(v).startswith('$ref:'):
                     slots[k]['examples'] = [{'value': v}]
-                    slot_values[k].update(vs)
+                    slot_distinct_values[k].update(vs)
+                    slot_values[k] += vs
                     if len(vs) > 1:
                         slots[k]['multivalued'] = True
         types = {}
         new_slots = {}
+        col_number = 0
+        unique_keys = []
         for sn, s in slots.items():
-            vals = slot_values[sn]
+            col_number += 1
+            is_unique = len(set(slot_values[sn])) == len(slot_values[sn])
+            is_pk = is_unique and col_number == 1
+            if self.source_schema and sn in self.source_schema.slots and self.source_schema.slots[sn].identifier:
+                is_pk = True
+            if is_pk:
+                s['identifier'] = True
+            elif is_unique:
+                unique_keys.append(sn)
+            vals = slot_distinct_values[sn]
+            if self.source_schema:
+                if sn in self.source_schema.slots:
+                    s['description'] = self.source_schema.slots[sn].description
             s['range'] = infer_range(s, vals, types)
+            logging.info(f"Slot {sn} has range {s['range']}")
             if (s['range'] == 'string' or sn in enum_columns) and sn not in enum_mask_columns:
+                filtered_vals = \
+                    [v
+                     for v in slot_values[sn]
+                     if not isinteger(v) and not isfloat(v) and not isboolean(v) and not is_date(v)]
+                n_filtered_vals = len(filtered_vals) + 1
                 n_distinct = len(vals)
                 longest = max([len(str(v)) for v in vals]) if n_distinct > 0 else 0
+                logging.info(f"Considering {sn} as enum: {n_distinct} distinct values / {n_filtered_vals}, longest={longest}")
                 if sn in enum_columns or \
-                        ((n_distinct / n) < self.enum_threshold and 0 < n_distinct <= self.max_enum_size
+                        ((n_distinct / n_filtered_vals) < self.enum_threshold and 0 < n_distinct <= self.max_enum_size
                          and longest < self.enum_strlen_threshold):
                     enum_name = sn.replace(' ', '_').replace('(s)', '')
                     enum_name = f'{enum_name}_enum'
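
To make the enum heuristic concrete: with 100 data rows and a string column holding 5 distinct values, none of which parse as integers, floats, booleans, or dates, the ratio is 5 / 101, roughly 0.05; that is below the default enum_threshold of 0.1, 5 is at most max_enum_size, and short labels stay under enum_strlen_threshold, so the column is modeled as an enum (the counts here are made up for illustration).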
@@ -416,6 +493,9 @@ def convert_dicts(self,
         for sn, s in new_slots.items():
             if sn not in slots:
                 slots[sn] = s
+
+        unique_keys = [UniqueKey(f"{k}_key",
+                                 unique_key_slots=[k]) for k in unique_keys]
         schema = SchemaDefinition(
             id=f'https://w3id.org/{schema_name}',
             name=schema_name,
Expand All @@ -426,7 +506,9 @@ def convert_dicts(self,
             classes=[
                 ClassDefinition(class_name,
                                 slots=class_slots,
-                                slot_usage=slot_usage)
+                                slot_usage=slot_usage,
+                                unique_keys=unique_keys,
+                                )
             ],
             slots=slots,
             enums=enums
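
For a column whose values are all unique but which is not the first (identifier) column, the generated class now carries a unique key; a sketch of the resulting LinkML YAML (class and slot names are made up):

    classes:
      Item:
        unique_keys:
          email_key:
            unique_key_slots:
              - email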
@@ -465,6 +547,16 @@ def isfloat(value):
     except ValueError:
         return False


+def isinteger(value):
+    try:
+        int(value)
+        return True
+    except ValueError:
+        return False
+
+
+def isboolean(value):
+    return value in ['true', 'false']


 def is_measurement(value):
     ms = q_parser.parse(value)
@@ -503,8 +595,18 @@ def is_all_measurement(values):
     return False


-def infer_range(slot: dict, vals: set, types: dict) -> str:
+def infer_range(slot: dict, vals: set, types: dict, coerce=True) -> str:
+    """
+    Infers the range of a slot based on the values
+
+    :param slot:
+    :param vals:
+    :param types:
+    :return:
+    """
+    logging.info(f"Inferring range for {list(vals)[0:5]}...")
     nn_vals = [v for v in vals if v is not None and v != ""]
+    logging.info(f"FILTERED: {list(nn_vals)[0:5]}...")
     if len(nn_vals) == 0:
         return 'string'
     if all(str(v).startswith('$ref:') for v in nn_vals):
@@ -513,12 +615,15 @@ def infer_range(slot: dict, vals: set, types: dict) -> str:
         return 'integer'
     if all(isinstance(v, float) for v in nn_vals):
         return 'float'
-    if all(str(v).isdigit() for v in nn_vals):
-        return 'integer'
-    if all(is_date(v) for v in nn_vals):
-        return 'datetime'
-    if all(isfloat(v) for v in nn_vals):
-        return 'float'
+    if coerce:
+        if all(isinteger(v) for v in nn_vals):
+            return 'integer'
+        if all(isboolean(v) for v in nn_vals):
+            return 'boolean'
+        if all(isfloat(v) for v in nn_vals):
+            return 'float'
+        if all(is_date(v) for v in nn_vals):
+            return 'datetime'
     if is_all_measurement(nn_vals):
         return 'measurement'
     v0 = nn_vals[0]
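
Under the new coerce branch, string-valued columns are promoted to more specific ranges when every non-empty value parses; a quick illustration (values are made up):

    infer_range({}, {"1", "2", "3"}, {})    # -> 'integer'
    infer_range({}, {"true", "false"}, {})  # -> 'boolean'
    infer_range({}, {"1.5", "2"}, {})       # -> 'float'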
@@ -535,12 +640,17 @@ def infer_range(slot: dict, vals: set, types: dict) -> str:
     return 'string'


-def get_db(db_id: str) -> str:
-    parts = db_id.split(':')
-    if len(parts) > 1:
-        return parts[0]
-    else:
-        return None
+def get_db(db_id: str) -> Optional[str]:
+    """
+    Extracts the database prefix from a CURIE
+
+    :param db_id:
+    :return:
+    """
+    if isinstance(db_id, str) and ':' in db_id:
+        parts = db_id.split(':')
+        if len(parts) > 1:
+            return parts[0]


 def is_date(string, fuzzy=False):
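The hardened get_db now degrades gracefully on non-CURIE input; for example (inputs are made up):

    get_db("SO:0000704")   # -> 'SO'
    get_db("not a curie")  # -> None (no colon)
    get_db(None)           # -> None (fails the isinstance guard)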
13 changes: 13 additions & 0 deletions tests/resources/bio.obo
@@ -0,0 +1,13 @@
+format-version: 1.2
+ontology: bio
+
+[Term]
+id: BIO:1
+name: biochemical reaction
+def: "A biochemical reaction" []
+
+[Term]
+id: BIO:2
+name: chemical structure
+def: "A chemical structure" []

(The remaining changed files, including the additional tests, are not rendered here.)