CsvGeneralizer: Adding ability to interpret metadata rows (#106)
- adding docs to parameters
- expose more parameters in CLI
- additional tests
cmungall authored Nov 10, 2022
1 parent 455c91a commit b96e566
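
The headline feature can be exercised from the CLI via the --data-dictionary-row-count option added in cli.py below; an illustrative invocation, extending the generalize-tsv example from that file's docstring (file path and names are from that example):

    schemauto generalize-tsv --data-dictionary-row-count 1 --class-name Person --schema-name PersonInfo my/data/persons.tsv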
Showing 7 changed files with 632 additions and 2,139 deletions.
2,492 changes: 384 additions & 2,108 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -24,7 +24,7 @@ psycopg2-binary = "^2.9.2"
 strsimpy = "^0.2.1"
 requests = "^2.26.0"
 bioregistry = "^0.5.87"
-oaklib = "^0.1.43"
+oaklib = "^0.1.52"
 pandera = "^0.12.0"
 tomlkit = "^0.11.4"
 inflect = "^6.0.0"
15 changes: 13 additions & 2 deletions schema_automator/annotators/schema_annotator.py
@@ -55,6 +55,7 @@ def annotate_element(self, elt: Union[PermissibleValue, Element]) -> None:
         if self.mine_descriptions and elt.description:
             texts.append(elt.description)
         for text in texts:
+            logging.info(f"Annotating: {text}")
             for r in self.annotate_text(text):
                 logging.debug(f'MATCH: {r}')
                 if self.allow_partial or r.matches_whole_text:
@@ -91,13 +92,15 @@ def annotate_text(self, text: str) -> Iterator[TextAnnotation]:
         oi = self.ontology_implementation
         text_exp = uncamel(text)  # TODO: use main linkml_runtime method
         if isinstance(oi, TextAnnotatorInterface):
+            logging.debug(f"Using TextAnnotatorInterface on {text_exp}")
             # TextAnnotation is available; use this by default
             for r in oi.annotate_text(text_exp):
                 yield r
             if text_exp != text.lower():
                 for r in oi.annotate_text(text_exp):
                     yield r
         elif isinstance(oi, SearchInterface):
+            logging.debug(f"Using SearchInterface on {text_exp}")
             # use search as an alternative
             cfg = SearchConfiguration(is_complete=True)
             for r in oi.basic_search(text, config=cfg):
@@ -128,7 +131,15 @@ def enrich(self, schema: Union[SchemaDefinition, str]) -> SchemaDefinition:
         Enrich a schema by performing lookups on the external ontology/vocabulary endpoint,
         and copying over metadata

-        Currently the only metadata obtained is text definitions
+        Currently, the only metadata obtained is text definitions.
+
+        .. code-block:: python
+
+            >>> from schema_automator.annotators.schema_annotator import SchemaAnnotator
+            >>> from oaklib.selector import get_implementation_from_shorthand
+            >>> oi = get_implementation_from_shorthand("sqlite:obo:so")
+            >>> sa = SchemaAnnotator(ontology_implementation=oi)
+            >>> schema = sa.enrich("tests/data/schema.yaml")

         :param schema:
         :return:
@@ -160,7 +171,7 @@ def _add_description_from_curies(self, elt: Union[Element, PermissibleValue], cu
             if elt.description:
                 break
             try:
-                defn = oi.get_definition_by_curie(x)
+                defn = oi.definition(x)
                 if defn:
                     elt.description = defn
                 else:
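The get_definition_by_curie -> definition rename above tracks the oaklib bump to ^0.1.52 in pyproject.toml; a minimal sketch of the renamed call (the selector comes from the docstring example above; the CURIE is illustrative):

    from oaklib.selector import get_implementation_from_shorthand

    oi = get_implementation_from_shorthand("sqlite:obo:so")
    # returns the text definition for the CURIE, or None if there is none
    print(oi.definition("SO:0000704"))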
30 changes: 27 additions & 3 deletions schema_automator/cli.py
@@ -87,6 +87,13 @@ def main(verbose: int, quiet: bool):
 @click.option('--column-separator', '-s', default='\t', help='separator')
 @click.option('--downcase-header/--no-downcase-header', default=False, help='if true make headers lowercase')
 @click.option('--enum-columns', '-E', multiple=True, help='column that is forced to be an enum')
+@click.option('--enum-threshold', type=click.FLOAT, help='set high to be more inclusive')
+@click.option('--max-enum-size',
+              type=click.INT,
+              help='set high to be more inclusive')
+@click.option('--data-dictionary-row-count',
+              type=click.INT,
+              help='number of rows after the header that provide metadata about columns')
 @click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
 @click.option('--pandera/--no-pandera', default=False, help='set to use pandera as the inference engine')
 def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, annotator, **kwargs):
@@ -99,6 +106,7 @@ def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, anno
         schemauto generalize-tsv --class-name Person --schema-name PersonInfo my/data/persons.tsv
     """
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
     if pandera:
         ie = PandasDataGeneralizer(**kwargs)
     else:
@@ -387,16 +395,32 @@ def annotate_schema(schema: str, input: str, output: str, **kwargs):
 @main.command()
 @click.argument('schema')
 @click.option('--input', '-i', help="OAK input ontology selector")
+@click.option('--annotate/--no-annotate', default=True, help="If true, annotate the schema")
 @output_option
-def enrich_schema(schema: str, input: str, output: str, **args):
+def enrich_schema(schema: str, input: str, output: str, annotate: bool, **args):
     """
-    Annotate all elements of a schema
+    Enrich a schema using an ontology.
+
+    This will use OAK to add additional metadata using uris and mappings in the schema.
+    For example, if your schema has a class with a mapping to a SO class,
+    then the definition of that class will be copied to the class description.
+
+    Example (requires a BioPortal API key):
+
+        schemauto enrich-schema -i bioportal: my-schema.yaml -o my-enriched.yaml
+
+    If your schema has no mappings you can use --annotate to add them.
+
+    Example:
+
+        schemauto enrich-schema -i so.obo --annotate my-schema.yaml -o my-enriched.yaml
     """
     impl = get_implementation_from_shorthand(input)
     annr = SchemaAnnotator(impl)
+    logging.info(f"Enriching: {schema}")
+    if annotate:
+        schema = annr.annotate_schema(schema)
     schema = annr.enrich(schema)
     write_schema(schema, output)

156 changes: 133 additions & 23 deletions schema_automator/generalizers/csv_data_generalizer.py
@@ -1,7 +1,7 @@
 import click
 import logging
 import yaml
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set, Any
 from collections import defaultdict
 import os
 import re
@@ -14,6 +14,7 @@
 from deprecation import deprecated
 from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, TypeDefinition, SlotDefinition
+from linkml_runtime.linkml_model.meta import UniqueKey
 from quantulum3 import parser as q_parser
 from dataclasses import dataclass, field

@@ -63,17 +64,46 @@ class CsvDataGeneralizer(Generalizer):
     """

     column_separator: str = "\t"
+    """character that separates columns in the input file"""

     schema_name: str = 'example'
+    """LinkML schema name (no spaces)"""

     robot: bool = False
+    """If true, conforms to ROBOT template format. Data dictionary rows start with '>'"""
+
+    data_dictionary_row_count: int = field(default=0)
+    """number of rows after the header that provide data dictionary information about the columns"""

     enum_columns: List[str] = field(default_factory=lambda: [])
+    """List of columns that are coerced into enums"""

     enum_mask_columns: List[str] = field(default_factory=lambda: [])
+    """List of columns that are excluded from being enums"""

     enum_threshold: float = 0.1
+    """If the number of distinct values divided by the total number of values is less than this, the column is considered an enum"""

     enum_strlen_threshold: int = 30
+    """Maximum length of a string to be considered a permissible enum value"""

     max_enum_size: int = 50
+    """Max number of permissible values for a column to be considered an enum"""

     downcase_header: bool = False
+    """If true, coerce column names to be lower case"""

     infer_foreign_keys: bool = False
+    """For multi-CSV files, infer linkages between rows"""

-    max_pk_len: int = 60  # URIs can be long..
+    max_pk_len: int = 60
+    """Maximum length to be considered for a primary key column. Note: URIs can be long"""

     min_distinct_fk_val: int = 8
+    """For inferring foreign keys, there must be at least this many distinct values"""

+    source_schema: Optional[SchemaDefinition] = None
+    """Optional base schema to draw from"""
     def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
         """
@@ -297,14 +327,31 @@ def convert_dicts(self,
                       rr: List[Dict],
                       schema_name: str = 'example',
                       class_name: str = DEFAULT_CLASS_NAME,
-                      **kwargs) -> SchemaDefinition:
+                      **kwargs) -> Optional[SchemaDefinition]:
+        """
+        Converts a list of row objects to a schema.
+
+        Each row is a data item, presumed to be of the same type,
+        that is generalized.
+
+        :param rr:
+        :param schema_name:
+        :param class_name:
+        :param kwargs:
+        :return:
+        """
         slots = {}
-        slot_values = {}
+
+        slot_distinct_values: Dict[str, Set[Any]] = {}
+        """distinct values for each slot"""
+
+        slot_values: Dict[str, List[Any]] = defaultdict(list)
+        """all values for each slot"""
+
         n = 0
         enums = {}
         robot_defs = {}
         slot_usage = {}
         types = {}
         enum_columns = self.enum_columns
         enum_mask_columns = self.enum_mask_columns
         if len(rr) == 0:
@@ -317,6 +364,14 @@
                 for k, v in row.items():
                     robot_defs[k] = v
                 continue
+            if n <= self.data_dictionary_row_count:
+                if self.source_schema is None:
+                    self.source_schema = SchemaDefinition(id="auto", name="auto")
+                for k, v in row.items():
+                    if k not in self.source_schema.slots:
+                        self.source_schema.slots[k] = SlotDefinition(k)
+                    self.source_schema.slots[k].description = v
+                continue
             for k, v in row.items():
                 if k is None or k == '':
                     continue
@@ -332,22 +387,44 @@ def convert_dicts(self,
                     vs = [v]
                 if k not in slots:
                     slots[k] = {'range': None}
-                    slot_values[k] = set()
+                    slot_distinct_values[k] = set()
                 if v is not None and v != "" and not str(v).startswith('$ref:'):
                     slots[k]['examples'] = [{'value': v}]
-                    slot_values[k].update(vs)
+                    slot_distinct_values[k].update(vs)
+                    slot_values[k] += vs
                     if len(vs) > 1:
                         slots[k]['multivalued'] = True
         types = {}
         new_slots = {}
+        col_number = 0
+        unique_keys = []
         for sn, s in slots.items():
-            vals = slot_values[sn]
+            col_number += 1
+            is_unique = len(set(slot_values[sn])) == len(slot_values[sn])
+            is_pk = is_unique and col_number == 1
+            if self.source_schema and sn in self.source_schema.slots and self.source_schema.slots[sn].identifier:
+                is_pk = True
+            if is_pk:
+                s['identifier'] = True
+            elif is_unique:
+                unique_keys.append(sn)
+            vals = slot_distinct_values[sn]
+            if self.source_schema:
+                if sn in self.source_schema.slots:
+                    s['description'] = self.source_schema.slots[sn].description
             s['range'] = infer_range(s, vals, types)
+            logging.info(f"Slot {sn} has range {s['range']}")
             if (s['range'] == 'string' or sn in enum_columns) and sn not in enum_mask_columns:
+                filtered_vals = \
+                    [v
+                     for v in slot_values[sn]
+                     if not isinteger(v) and not isfloat(v) and not isboolean(v) and not is_date(v)]
+                n_filtered_vals = len(filtered_vals) + 1
                 n_distinct = len(vals)
                 longest = max([len(str(v)) for v in vals]) if n_distinct > 0 else 0
+                logging.info(f"Considering {sn} as enum: {n_distinct} distinct values / {n_filtered_vals}, longest={longest}")
                 if sn in enum_columns or \
-                        ((n_distinct / n) < self.enum_threshold and 0 < n_distinct <= self.max_enum_size
+                        ((n_distinct / n_filtered_vals) < self.enum_threshold and 0 < n_distinct <= self.max_enum_size
                          and longest < self.enum_strlen_threshold):
                     enum_name = sn.replace(' ', '_').replace('(s)', '')
                     enum_name = f'{enum_name}_enum'
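
To make the enum heuristic concrete: with 100 data rows and a string column holding 5 distinct values, none of which parse as integers, floats, booleans, or dates, the ratio is 5 / 101, roughly 0.05; that is below the default enum_threshold of 0.1, 5 is at most max_enum_size, and short labels stay under enum_strlen_threshold, so the column is modeled as an enum (the counts here are made up for illustration).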
@@ -416,6 +493,9 @@ def convert_dicts(self,
         for sn, s in new_slots.items():
             if sn not in slots:
                 slots[sn] = s
+
+        unique_keys = [UniqueKey(f"{k}_key",
+                                 unique_key_slots=[k]) for k in unique_keys]
         schema = SchemaDefinition(
             id=f'https://w3id.org/{schema_name}',
             name=schema_name,
Expand All @@ -426,7 +506,9 @@ def convert_dicts(self,
             classes=[
                 ClassDefinition(class_name,
                                 slots=class_slots,
-                                slot_usage=slot_usage)
+                                slot_usage=slot_usage,
+                                unique_keys=unique_keys,
+                                )
             ],
             slots=slots,
             enums=enums
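
For a column whose values are all unique but which is not the first (identifier) column, the generated class now carries a unique key; a sketch of the resulting LinkML YAML (class and slot names are made up):

    classes:
      Item:
        unique_keys:
          email_key:
            unique_key_slots:
              - email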
@@ -465,6 +547,16 @@ def isfloat(value):
     except ValueError:
         return False


+def isinteger(value):
+    try:
+        int(value)
+        return True
+    except ValueError:
+        return False
+
+
+def isboolean(value):
+    return value in ['true', 'false']


 def is_measurement(value):
     ms = q_parser.parse(value)
@@ -503,8 +595,18 @@ def is_all_measurement(values):
     return False


-def infer_range(slot: dict, vals: set, types: dict) -> str:
+def infer_range(slot: dict, vals: set, types: dict, coerce=True) -> str:
+    """
+    Infers the range of a slot based on the values
+
+    :param slot:
+    :param vals:
+    :param types:
+    :return:
+    """
+    logging.info(f"Inferring range for {list(vals)[0:5]}...")
     nn_vals = [v for v in vals if v is not None and v != ""]
+    logging.info(f"FILTERED: {list(nn_vals)[0:5]}...")
     if len(nn_vals) == 0:
         return 'string'
     if all(str(v).startswith('$ref:') for v in nn_vals):
@@ -513,12 +615,15 @@ def infer_range(slot: dict, vals: set, types: dict) -> str:
         return 'integer'
     if all(isinstance(v, float) for v in nn_vals):
         return 'float'
-    if all(str(v).isdigit() for v in nn_vals):
-        return 'integer'
-    if all(is_date(v) for v in nn_vals):
-        return 'datetime'
-    if all(isfloat(v) for v in nn_vals):
-        return 'float'
+    if coerce:
+        if all(isinteger(v) for v in nn_vals):
+            return 'integer'
+        if all(isboolean(v) for v in nn_vals):
+            return 'boolean'
+        if all(isfloat(v) for v in nn_vals):
+            return 'float'
+        if all(is_date(v) for v in nn_vals):
+            return 'datetime'
     if is_all_measurement(nn_vals):
         return 'measurement'
     v0 = nn_vals[0]
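
Under the new coerce branch, string-valued columns are promoted to more specific ranges when every non-empty value parses; a quick illustration (values are made up):

    infer_range({}, {"1", "2", "3"}, {})    # -> 'integer'
    infer_range({}, {"true", "false"}, {})  # -> 'boolean'
    infer_range({}, {"1.5", "2"}, {})       # -> 'float'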
@@ -535,12 +640,17 @@ def infer_range(slot: dict, vals: set, types: dict) -> str:
     return 'string'


-def get_db(db_id: str) -> str:
-    parts = db_id.split(':')
-    if len(parts) > 1:
-        return parts[0]
-    else:
-        return None
+def get_db(db_id: str) -> Optional[str]:
+    """
+    Extracts the database prefix from a CURIE
+
+    :param db_id:
+    :return:
+    """
+    if isinstance(db_id, str) and ':' in db_id:
+        parts = db_id.split(':')
+        if len(parts) > 1:
+            return parts[0]


 def is_date(string, fuzzy=False):
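The hardened get_db now degrades gracefully on non-CURIE input; for example (inputs are made up):

    get_db("SO:0000704")   # -> 'SO'
    get_db("not a curie")  # -> None (no colon)
    get_db(None)           # -> None (fails the isinstance guard)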
13 changes: 13 additions & 0 deletions tests/resources/bio.obo
@@ -0,0 +1,13 @@
+format-version: 1.2
+ontology: bio
+
+[Term]
+id: BIO:1
+name: biochemical reaction
+def: "A biochemical reaction" []
+
+[Term]
+id: BIO:2
+name: chemical structure
+def: "A chemical structure" []

(The remaining changed files, including the additional tests, are not rendered here.)