linkml
diff --git a/‎Makefile
Lines changed: 5 additions & 0 deletions b/‎Makefile
Lines changed: 5 additions & 0 deletions
diff --git a/‎schema_automator/annotators/enum_annotator.py
Lines changed: 0 additions & 1 deletion b/‎schema_automator/annotators/enum_annotator.py
Lines changed: 0 additions & 1 deletion
diff --git a/‎schema_automator/annotators/schema_annotator.py
Lines changed: 63 additions & 89 deletions b/‎schema_automator/annotators/schema_annotator.py
Lines changed: 63 additions & 89 deletions
diff --git a/‎schema_automator/cli.py
Lines changed: 14 additions & 6 deletions b/‎schema_automator/cli.py
Lines changed: 14 additions & 6 deletions
diff --git a/‎schema_automator/enhancer/__init__.py b/‎schema_automator/enhancer/__init__.py
diff --git a/‎schema_automator/enhancer/general_enhancer.py
Lines changed: 17 additions & 0 deletions b/‎schema_automator/enhancer/general_enhancer.py
Lines changed: 17 additions & 0 deletions
diff --git a/‎schema_automator/generalizers/csv_data_generalizer.py
Lines changed: 20 additions & 3 deletions b/‎schema_automator/generalizers/csv_data_generalizer.py
Lines changed: 20 additions & 3 deletions
diff --git a/‎schema_automator/generalizers/json_instance_generalizer.py
Lines changed: 0 additions & 1 deletion b/‎schema_automator/generalizers/json_instance_generalizer.py
Lines changed: 0 additions & 1 deletion
diff --git a/‎schema_automator/importers/dosdp_import_engine.py
Lines changed: 12 additions & 6 deletions b/‎schema_automator/importers/dosdp_import_engine.py
Lines changed: 12 additions & 6 deletions
@@ -58,3 +58,8 @@ target/availabilities_g_s_strain_202112151116_org_meanings_curated.yaml: target/
 		--model_in target/availabilities_g_s_strain_202112151116_org_meanings.yaml \
 		--curated_yaml $@ \
 		--selected_enum organism_enum
+
+# create a convenient wrapper script;
+# this can be used outside the poetry environment.
+# The wrapper starts with a shebang so that it can be exec'd directly
+# (not only from an interactive shell), and its directory is created first.
+bin/schemauto:
+	mkdir -p $(@D)
+	printf '#!/bin/sh\nexec "%s" "$$@"\n' "$$(poetry run which schemauto)" > $@ && chmod +x $@
@@ -321,7 +321,6 @@ def enum_annotator(modelfile, all_mappings_fn, requested_enum_name, whiteout_cha
             # A value is trying to be set on a copy of a slice from a DataFrame.
             for_str_dist["tidied_query_lc"] = for_str_dist["tidied_query"].str.lower()
             for_str_dist["name_lc"] = for_str_dist["name"].str.lower()
-            logger.debug(for_str_dist)
 
             # favoring simplicity over efficiency
             # ie may be string-comparing some duplicates
 
@@ -8,120 +8,94 @@
 import os
 from dataclasses import dataclass
 from pprint import pprint
-from typing import Any, List, Dict, Union
+from typing import Any, List, Dict, Union, Iterator
 
 from linkml_runtime.linkml_model import SchemaDefinition
-from linkml_runtime.utils.schemaview import SchemaView
+from linkml_runtime.utils.metamodelcore import Curie
+from linkml_runtime.utils.schemaview import SchemaView, re
+from oaklib import BasicOntologyInterface
+from oaklib.datamodels.search import SearchConfiguration
+from oaklib.datamodels.text_annotator import TextAnnotation
+from oaklib.interfaces import SearchInterface
+from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface
 
 from schema_automator.utils.schemautils import minify_schema
 
 REST_URL = "http://data.bioontology.org"
+camel_case_pattern = re.compile(r'(?<!^)(?=[A-Z])')
 
-ANNOTATION = Dict[str, Any]
-
-@dataclass
-class Term:
-    id: str
-    prefLabel: str
-    synonyms: List[str] = None
-    definition: str = None
-    semanticType: str = None
-    cui: str = None
-
-@dataclass
-class Annotation:
-    start_position: int
-    end_position: int
-    matchType: str
-    text: str
-    source: str
-
-    def complete(self) -> bool:
-        return len(self.source) == (self.end_position - self.start_position) + 1
-
-@dataclass
-class Result:
-    annotatedClass: Term
-    annotations: List[Annotation] = None
-    mappings: List = None
-
-    def complete(self) -> bool:
-        return any(a for a in self.annotations if a.complete())
-
-@dataclass
-class ResultSet:
-    results: List[Result] = None
+def uncamel(n: str) -> str:
+    """
+    Convert a CamelCase and/or snake_case name into lower-case words separated by single spaces
+
+    A space is inserted before every upper-case letter that is not at the
+    start of the string, underscores become spaces, and the result is lower-cased.
+    Whitespace is then normalised so that inputs such as ``my_Field`` or
+    ``_Foo`` do not produce doubled or leading blanks.
+
+    :param n: name to expand
+    :return: space-separated, lower-cased form of the name
+    """
+    spaced = camel_case_pattern.sub(' ', n).lower().replace('_', ' ')
+    # collapse runs of whitespace and strip the ends
+    return ' '.join(spaced.split())
 
 @dataclass
 class SchemaAnnotator:
-    bioportal_api_key: str = None
-
-    def load_bioportal_api_key(self, path: str = None) -> None:
-        if path is None:
-            path = os.path.join('conf', 'bioportal_apikey.txt')
-        with open(path) as stream:
-            lines = stream.readlines()
-            key = lines[0].strip()
-            self.bioportal_api_key = key
-
-    def get_json(self, url) -> Any:
-        opener = urllib.request.build_opener()
-        opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
-        return json.loads(opener.open(url).read())
-
-    def annotate_text(self, text, include: List = None, require_exact_match=True) -> ResultSet:
-        logging.info(f'Annotating text: {text}')
-        if include is None:
-            include =['prefLabel', 'synonym', 'definition', 'semanticType', 'cui']
-        include_str = ','.join(include)
-        params = {'include':  include_str,
-                  'require_exact_match': require_exact_match,
-                  'text': text}
-        if self.bioportal_api_key is  None:
-            self.load_bioportal_api_key()
-        r = requests.get(REST_URL + '/annotator',
-                         headers={'Authorization': 'apikey token=' + self.bioportal_api_key},
-                         params=params)
-        #return r.json()
-        return self.json_to_results(r.json(), text)
-
-    def json_to_results(self, json_list: List[Any], text: str) -> ResultSet:
-        results = []
-        for obj in json_list:
-            #print(f'JSON: {obj}')
-            ac_obj = obj['annotatedClass']
-            ac = Term(id=ac_obj['@id'], prefLabel=ac_obj.get('prefLabel', None))
-            anns = [Annotation(start_position=x['from'],
-                               end_position=x['to'],
-                               matchType=x['matchType'],
-                               text=x['text'],
-                               source=text) for x in obj['annotations']]
-            r = Result(annotatedClass=ac, annotations=anns)
-            logging.debug(f'RESULT: {r}')
-            results.append(r)
-        return ResultSet(results)
-
-    def annotate_schema(self, schema: Union[SchemaDefinition, str], match_only=True) -> SchemaDefinition:
+    ontology_implementation: BasicOntologyInterface
+
+    def annotate_text(self, text: str) -> Iterator[TextAnnotation]:
+        """
+        Yield ontology matches for a piece of text
+
+        This is a wrapper over OAK annotation and search;
+        it (1) expands CamelCase (2) abstracts over annotation vs search.
+        The text is first looked up as given; if its un-camel-cased form differs
+        from the lower-cased original, the expanded form is looked up as well.
+
+        TODO: fold this functionality back into OAK
+
+        :param text: text to look up
+        :return: iterator over the matches found
+        :raises NotImplementedError: if the ontology implementation is neither a
+            TextAnnotatorInterface nor a SearchInterface
+        """
+        oi = self.ontology_implementation
+        text_exp = uncamel(text)
+        if isinstance(oi, TextAnnotatorInterface):
+            # TextAnnotation is available; use this by default
+            # annotate the original text first (mirrors the search branch below)
+            yield from oi.annotate_text(text)
+            if text_exp != text.lower():
+                yield from oi.annotate_text(text_exp)
+        elif isinstance(oi, SearchInterface):
+            # use search as an alternative
+            cfg = SearchConfiguration(is_complete=True)
+            for r in oi.basic_search(text, config=cfg):
+                yield TextAnnotation(object_id=r, matches_whole_text=True)
+            if text_exp != text.lower():
+                for r in oi.basic_search(text_exp, config=cfg):
+                    yield TextAnnotation(object_id=r, matches_whole_text=True)
+        else:
+            raise NotImplementedError
+
+    def annotate_schema(self, schema: Union[SchemaDefinition, str], curie_only=True) -> SchemaDefinition:
         """
         Annotate all elements of a schema, adding mappings
+
+        Element names and aliases are looked up with annotate_text and whole-text
+        matches are added to the element's exact_mappings. Permissible values of
+        enums are looked up by their text; the first match becomes the meaning
+        and further matches are added to exact_mappings.
+
+        :param schema: schema object, or a source accepted by SchemaView
+        :param curie_only: if True, skip matches whose ID is not a CURIE
+        :return: the schema held by the SchemaView, with mappings added
         """
         sv = SchemaView(schema)
         for elt_name, elt in sv.all_elements().items():
             for n in [elt.name] + elt.aliases:
-                rs = self.annotate_text(n, require_exact_match=True)
-                for r in rs.results:
-                     if r.complete():
-                        xref = r.annotatedClass.id
+                for r in self.annotate_text(n):
+                    logging.debug(f'MATCH: {r}')
+                    if r.matches_whole_text:
+                        xref = r.object_id
+                        if curie_only and not Curie.is_curie(xref):
+                            continue
                         logging.info(f'Mapping from {elt_name} "{n}" to {xref}')
                         if xref not in elt.exact_mappings:
                             elt.exact_mappings.append(xref)
+        for e in sv.all_enums().values():
+            for pv in e.permissible_values.values():
+                for r in self.annotate_text(pv.text):
+                    logging.debug(f'MATCH: {r}')
+                    if r.matches_whole_text:
+                        xref = r.object_id
+                        if curie_only and not Curie.is_curie(xref):
+                            continue
+                        # report the enum and permissible value being mapped,
+                        # not the leftover variables of the element loop above
+                        logging.info(f'Mapping from {e.name} "{pv.text}" to {xref}')
+                        if pv.meaning is None:
+                            logging.info(f'Arbitrarily choosing first match: {xref}')
+                            pv.meaning = xref
+                        elif xref != pv.meaning and xref not in pv.exact_mappings:
+                            # do not repeat the meaning itself as a mapping
+                            pv.exact_mappings.append(xref)
+
         return sv.schema
 
 
 @click.command()
 @click.argument('schema')
+@click.option('--input', '-i', help="OAK input ontology selector")
 @click.option('--output', '-o', help="Path to saved yaml schema")
-def annotate_schema(schema: str, output: str, **args):
+def annotate_schema(schema: str, input: str, output: str, **args):
     """
     Annotate all elements of a schema
     """
 
@@ -9,6 +9,7 @@
 
 import yaml
 from linkml_runtime.linkml_model import SchemaDefinition
+from oaklib.selector import get_resource_from_shorthand, get_implementation_from_shorthand
 
 from schema_automator import JsonLdAnnotator
 from schema_automator.annotators.schema_annotator import SchemaAnnotator
@@ -135,14 +136,15 @@ def import_dosdps(dpfiles, output, **args):
 @main.command()
 @click.argument('input')
 @output_option
-@click.option('--container-class-name', help="name of root class")
+@schema_name_option
+@click.option('--container-class-name', default='Container', help="name of root class")
 @click.option('--format', '-f', default='json', help="json or yaml (or json.gz or yaml.gz) or frontmatter")
 @click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
 @click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
 @click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
 @click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
 @click.option('--omit-null/--no-omit-null', default=False, help="if true, ignore null values")
-def generalize_json(input, output, format, omit_null, **kwargs):
+def generalize_json(input, output, schema_name, format, omit_null, **kwargs):
     """
     Generalizes from a JSON file to a schema
 
@@ -153,7 +155,7 @@ def generalize_json(input, output, format, omit_null, **kwargs):
         schemauto generalize-json my/data/persons.json
     """
     ie = JsonDataGeneralizer(omit_null=omit_null)
-    schema = ie.convert(input, dir=dir, format=format, **kwargs)
+    schema = ie.convert(input, format=format, **kwargs)
     write_schema(schema, output)
 
 
@@ -224,16 +226,22 @@ def generalize_rdf(rdffile, dir, output, **args):
 
 @main.command()
 @click.argument('schema')
+@click.option('--curie-only/--no-curie-only',
+              default=False,
+              show_default=True,
+              help="if set, only use results that are mapped to CURIEs")
+@click.option('--input', '-i', help="OAK input ontology selector")
 @output_option
-def annotate_schema(schema: str, output: str, **args):
+def annotate_schema(schema: str, input: str, output: str, curie_only: bool, **args):
     """
     Annotate all elements of a schema
 
-    Requires Bioportal API key
+    Mappings are looked up using the OAK ontology implementation
+    selected with --input; with --curie-only, only results that
+    are mapped to CURIEs are used.
     """
+    impl = get_implementation_from_shorthand(input)
     logging.basicConfig(level=logging.INFO)
-    annr = SchemaAnnotator()
-    schema = annr.annotate_schema(schema)
+    annr = SchemaAnnotator(impl)
+    schema = annr.annotate_schema(schema, curie_only=curie_only)
     write_schema(schema, output)
 
 
 
@@ -0,0 +1,17 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class GeneralSchemaEnhancer:
+    """
+    Placeholder for general-purpose schema enhancement.
+
+    The main functions have moved to core linkml, see https://github.com/linkml/linkml/pull/854
+
+    This class is currently a stub for future enhancements.
+    """
+    pass
+
+
+
+
+
+
@@ -31,6 +31,9 @@
 
 @dataclass
 class ForeignKey:
+    """
+    Represents a field in one table that points to an identifier field in another
+    """
     source_table: str
     source_column: str
     target_table: str
@@ -94,7 +97,7 @@ def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
             c = os.path.splitext(os.path.basename(file))[0]
             if self.downcase_header:
                 c = c.lower()
-            print(f'READING {file} ')
+            logging.info(f'READING {file} ')
             df = pd.read_csv(file, sep=self.column_separator, skipinitialspace=True).fillna("")
             if self.downcase_header:
                 df = df.rename(columns=str.lower)
@@ -187,6 +190,13 @@ def inject_foreign_keys(self, sv: SchemaView, fks: List[ForeignKey]) -> None:
             #tgt_slot['identifier'] = True
 
     def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition:
+        """
+        Converts multiple TSVs to a schema
+
+        :param files:
+        :param kwargs:
+        :return:
+        """
         if self.infer_foreign_keys:
             fks = self.infer_linkages(files)
         else:
@@ -199,16 +209,23 @@ def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition:
             s = self.convert(file, class_name=c, **kwargs)
             if s is not None:
                 schemas.append(s)
-            print(f'CLASSES={list(s.classes.keys())}')
+            logging.info(f'Classes={list(s.classes.keys())}')
         sv = SchemaView(schemas[0])
         for s in schemas[1:]:
             sv.merge_schema(s)
-            print(f'xxxCLASSES={list(sv.all_classes().keys())}')
+            logging.info(f'Classes, post merge={list(sv.all_classes().keys())}')
         #s = merge_schemas(yamlobjs)
         self.inject_foreign_keys(sv, fks)
         return sv.schema
 
     def convert(self, file: str, **kwargs) -> SchemaDefinition:
+        """
+        Converts a single TSV file to a single-class schema
+        
+        :param file:
+        :param kwargs:
+        :return:
+        """
         with open(file, newline='') as tsv_file:
             header = [h.strip() for h in tsv_file.readline().split('\t')]
             rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=self.column_separator, skipinitialspace=False)
 
@@ -140,7 +140,6 @@ def frontmatter2model(inputs, format, omit_null, **kwargs):
 
 
     """
-    print(f'INPUTS={inputs}')
     ie = JsonDataGeneralizer(omit_null=omit_null)
     objs = parse_frontmatter_files(list(inputs))
     schema = ie.convert({'objects': objs}, dir=dir, format=format, **kwargs)
 
@@ -28,14 +28,12 @@
 @dataclass
 class DOSDPImportEngine(ImportEngine):
     """
-    For every template with name Foo, a LinkML class FooTemplate is created
+    An ImportEngine that imports Ontology Design Patterns specified as DOSDP Yaml into a LinkML schema
 
-    The following builtin slots are created:
+    See `DOSDPs <https://github.com/INCATools/dead_simple_owl_design_patterns>`_
+
+    Every template maps to a LinkML class; the default class name for a template Foo is FooTemplate
 
-    - name
-    - definition
-    - subclass_of
-    - equivalentTo
     """
     mappings: dict = None
     include_unmapped_annotations = False
@@ -49,6 +47,14 @@ def load_dp(self, path) -> Pattern:
         return yaml_loader.load(obj, target_class=Pattern)
 
     def convert(self, files: str, range_as_enums = True, **kwargs) -> SchemaDefinition:
+        """
+        Converts one or more YAML files into a Schema
+
+        :param files:
+        :param range_as_enums: if True, then class ranges are mapped to Enums
+        :param kwargs:
+        :return:
+        """
         patterns = [self.load_dp(file) for file in files]
         schema = SchemaDefinition(**kwargs)
         if not schema.default_prefix: