Skip to content

Commit be6d7f0

Browse files
authored
Merge pull request #75 from linkml/refactoring
refactoring
2 parents c2a2a40 + 9aaabb3 commit be6d7f0

14 files changed

+257
-143
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,8 @@ target/availabilities_g_s_strain_202112151116_org_meanings_curated.yaml: target/
5858
--model_in target/availabilities_g_s_strain_202112151116_org_meanings.yaml \
5959
--curated_yaml $@ \
6060
--selected_enum organism_enum
61+
62+
# create a convenient wrapper script;
63+
# this can be used outside the poetry environment
64+
# Writes a small POSIX shell wrapper that forwards all arguments to the
# schemauto executable found inside the poetry environment.
# - the target directory is created first so the redirect cannot fail on a fresh checkout
# - a shebang line is emitted so the wrapper can also be exec'd by non-shell callers
bin/schemauto:
	mkdir -p $(@D)
	printf '#!/bin/sh\nexec %s "$$@"\n' "`poetry run which schemauto`" > $@ && chmod +x $@

schema_automator/annotators/enum_annotator.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,6 @@ def enum_annotator(modelfile, all_mappings_fn, requested_enum_name, whiteout_cha
321321
# A value is trying to be set on a copy of a slice from a DataFrame.
322322
for_str_dist["tidied_query_lc"] = for_str_dist["tidied_query"].str.lower()
323323
for_str_dist["name_lc"] = for_str_dist["name"].str.lower()
324-
logger.debug(for_str_dist)
325324

326325
# favoring simplicity over efficiency
327326
# ie may be string-comparing some duplicates

schema_automator/annotators/schema_annotator.py

Lines changed: 63 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -8,120 +8,94 @@
88
import os
99
from dataclasses import dataclass
1010
from pprint import pprint
11-
from typing import Any, List, Dict, Union
11+
from typing import Any, List, Dict, Union, Iterator
1212

1313
from linkml_runtime.linkml_model import SchemaDefinition
14-
from linkml_runtime.utils.schemaview import SchemaView
14+
from linkml_runtime.utils.metamodelcore import Curie
15+
from linkml_runtime.utils.schemaview import SchemaView, re
16+
from oaklib import BasicOntologyInterface
17+
from oaklib.datamodels.search import SearchConfiguration
18+
from oaklib.datamodels.text_annotator import TextAnnotation
19+
from oaklib.interfaces import SearchInterface
20+
from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface
1521

1622
from schema_automator.utils.schemautils import minify_schema
1723

1824
REST_URL = "http://data.bioontology.org"
25+
camel_case_pattern = re.compile(r'(?<!^)(?=[A-Z])')
1926

20-
ANNOTATION = Dict[str, Any]
21-
22-
@dataclass
23-
class Term:
24-
id: str
25-
prefLabel: str
26-
synonyms: List[str] = None
27-
definition: str = None
28-
semanticType: str = None
29-
cui: str = None
30-
31-
@dataclass
32-
class Annotation:
33-
start_position: int
34-
end_position: int
35-
matchType: str
36-
text: str
37-
source: str
38-
39-
def complete(self) -> bool:
40-
return len(self.source) == (self.end_position - self.start_position) + 1
41-
42-
@dataclass
43-
class Result:
44-
annotatedClass: Term
45-
annotations: List[Annotation] = None
46-
mappings: List = None
47-
48-
def complete(self) -> bool:
49-
return any(a for a in self.annotations if a.complete())
50-
51-
@dataclass
52-
class ResultSet:
53-
results: List[Result] = None
27+
def uncamel(n: str) -> str:
    """
    Convert a CamelCase and/or snake_case name into lower-case, space-separated words.

    A space is inserted before every upper-case letter (except at the start of
    the string), underscores are turned into spaces, and the result is lower-cased.
    Runs of whitespace are collapsed to a single space and surrounding whitespace
    is stripped, so mixed names such as ``my_Slot`` give ``my slot`` rather than
    a string containing a doubled space.

    :param n: element name, e.g. ``FooBar`` or ``foo_bar``
    :return: normalized text, e.g. ``foo bar``
    """
    spaced = camel_case_pattern.sub(' ', n).replace('_', ' ')
    # split()/join normalizes the whitespace introduced by the two substitutions above
    return ' '.join(spaced.lower().split())
5429

5530
@dataclass
5631
class SchemaAnnotator:
57-
bioportal_api_key: str = None
58-
59-
def load_bioportal_api_key(self, path: str = None) -> None:
60-
if path is None:
61-
path = os.path.join('conf', 'bioportal_apikey.txt')
62-
with open(path) as stream:
63-
lines = stream.readlines()
64-
key = lines[0].strip()
65-
self.bioportal_api_key = key
66-
67-
def get_json(self, url) -> Any:
68-
opener = urllib.request.build_opener()
69-
opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
70-
return json.loads(opener.open(url).read())
71-
72-
def annotate_text(self, text, include: List = None, require_exact_match=True) -> ResultSet:
73-
logging.info(f'Annotating text: {text}')
74-
if include is None:
75-
include =['prefLabel', 'synonym', 'definition', 'semanticType', 'cui']
76-
include_str = ','.join(include)
77-
params = {'include': include_str,
78-
'require_exact_match': require_exact_match,
79-
'text': text}
80-
if self.bioportal_api_key is None:
81-
self.load_bioportal_api_key()
82-
r = requests.get(REST_URL + '/annotator',
83-
headers={'Authorization': 'apikey token=' + self.bioportal_api_key},
84-
params=params)
85-
#return r.json()
86-
return self.json_to_results(r.json(), text)
87-
88-
def json_to_results(self, json_list: List[Any], text: str) -> ResultSet:
89-
results = []
90-
for obj in json_list:
91-
#print(f'JSON: {obj}')
92-
ac_obj = obj['annotatedClass']
93-
ac = Term(id=ac_obj['@id'], prefLabel=ac_obj.get('prefLabel', None))
94-
anns = [Annotation(start_position=x['from'],
95-
end_position=x['to'],
96-
matchType=x['matchType'],
97-
text=x['text'],
98-
source=text) for x in obj['annotations']]
99-
r = Result(annotatedClass=ac, annotations=anns)
100-
logging.debug(f'RESULT: {r}')
101-
results.append(r)
102-
return ResultSet(results)
103-
104-
def annotate_schema(self, schema: Union[SchemaDefinition, str], match_only=True) -> SchemaDefinition:
32+
ontology_implementation: BasicOntologyInterface
33+
34+
def annotate_text(self, text: str) -> Iterator[TextAnnotation]:
    """
    Yield ontology annotations for a piece of text.

    This is a wrapper over OAK annotation and search; it (1) expands CamelCase
    and (2) abstracts over annotation vs search.

    The text is always looked up as given. If expanding it with ``uncamel``
    produces something other than the plain lower-cased text, the expanded
    form is looked up as well.

    :param text: name or label to look up
    :return: iterator of TextAnnotation objects; results obtained via search
        are wrapped in a TextAnnotation with ``matches_whole_text=True``
    :raises NotImplementedError: if the ontology implementation supports neither
        text annotation nor search (raised when the iterator is first consumed)
    """
    # TODO: fold this functionality back into OAK
    oi = self.ontology_implementation
    text_exp = uncamel(text)
    # query the original text first; add the expanded form only when it differs
    queries = [text]
    if text_exp != text.lower():
        queries.append(text_exp)
    if isinstance(oi, TextAnnotatorInterface):
        # TextAnnotation is available; use this by default
        for query in queries:
            yield from oi.annotate_text(query)
    elif isinstance(oi, SearchInterface):
        # use search as an alternative
        cfg = SearchConfiguration(is_complete=True)
        for query in queries:
            for object_id in oi.basic_search(query, config=cfg):
                yield TextAnnotation(object_id=object_id, matches_whole_text=True)
    else:
        raise NotImplementedError
57+
58+
def annotate_schema(self, schema: Union[SchemaDefinition, str], curie_only=True) -> SchemaDefinition:
    """
    Annotate all elements of a schema, adding mappings

    Every schema element is looked up by its name and by each of its aliases;
    whole-text matches are appended to the element's ``exact_mappings``.
    Every permissible value of every enum is looked up by its text; the first
    whole-text match becomes the value's ``meaning`` when none is set, and
    further distinct matches are appended to its ``exact_mappings``.

    :param schema: schema object, or anything else SchemaView accepts
    :param curie_only: if True, ignore matches whose identifier is not a CURIE
    :return: the annotated schema held by the SchemaView
    """
    sv = SchemaView(schema)
    for elt_name, elt in sv.all_elements().items():
        for n in [elt.name] + elt.aliases:
            for r in self.annotate_text(n):
                logging.debug(f'MATCH: {r}')
                if not r.matches_whole_text:
                    continue
                xref = r.object_id
                if curie_only and not Curie.is_curie(xref):
                    continue
                logging.info(f'Mapping from {elt_name} "{n}" to {xref}')
                if xref not in elt.exact_mappings:
                    elt.exact_mappings.append(xref)
    for enum_name, e in sv.all_enums().items():
        for pv in e.permissible_values.values():
            for r in self.annotate_text(pv.text):
                logging.debug(f'MATCH: {r}')
                if not r.matches_whole_text:
                    continue
                xref = r.object_id
                if curie_only and not Curie.is_curie(xref):
                    continue
                # report the enum and permissible value actually being mapped
                logging.info(f'Mapping from {enum_name} "{pv.text}" to {xref}')
                if pv.meaning is None:
                    logging.info(f'Arbitrarily choosing first match: {xref}')
                    pv.meaning = xref
                elif xref != pv.meaning and xref not in pv.exact_mappings:
                    # a term already used as the meaning is not repeated as a mapping
                    pv.exact_mappings.append(xref)

    return sv.schema
11992

12093

12194
@click.command()
12295
@click.argument('schema')
96+
@click.option('--input', '-i', help="OAK input ontology selector")
12397
@click.option('--output', '-o', help="Path to saved yaml schema")
124-
def annotate_schema(schema: str, output: str, **args):
98+
def annotate_schema(schema: str, input: str, output: str, **args):
12599
"""
126100
Annotate all elements of a schema
127101
"""

schema_automator/cli.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import yaml
1111
from linkml_runtime.linkml_model import SchemaDefinition
12+
from oaklib.selector import get_resource_from_shorthand, get_implementation_from_shorthand
1213

1314
from schema_automator import JsonLdAnnotator
1415
from schema_automator.annotators.schema_annotator import SchemaAnnotator
@@ -135,14 +136,15 @@ def import_dosdps(dpfiles, output, **args):
135136
@main.command()
136137
@click.argument('input')
137138
@output_option
138-
@click.option('--container-class-name', help="name of root class")
139+
@schema_name_option
140+
@click.option('--container-class-name', default='Container', help="name of root class")
139141
@click.option('--format', '-f', default='json', help="json or yaml (or json.gz or yaml.gz) or frontmatter")
140142
@click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
141143
@click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
142144
@click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
143145
@click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
144146
@click.option('--omit-null/--no-omit-null', default=False, help="if true, ignore null values")
145-
def generalize_json(input, output, format, omit_null, **kwargs):
147+
def generalize_json(input, output, schema_name, format, omit_null, **kwargs):
146148
"""
147149
Generalizes from a JSON file to a schema
148150
@@ -153,7 +155,7 @@ def generalize_json(input, output, format, omit_null, **kwargs):
153155
schemauto generalize-json my/data/persons.json
154156
"""
155157
ie = JsonDataGeneralizer(omit_null=omit_null)
156-
schema = ie.convert(input, dir=dir, format=format, **kwargs)
158+
schema = ie.convert(input, format=format, **kwargs)
157159
write_schema(schema, output)
158160

159161

@@ -224,16 +226,22 @@ def generalize_rdf(rdffile, dir, output, **args):
224226

225227
@main.command()
226228
@click.argument('schema')
229+
@click.option('--curie-only/--no-curie-only',
230+
default=False,
231+
show_default=True,
232+
help="if set, only use results that are mapped to CURIEs")
233+
@click.option('--input', '-i', help="OAK input ontology selector")
227234
@output_option
228-
def annotate_schema(schema: str, output: str, **args):
235+
def annotate_schema(schema: str, input: str, output: str, curie_only: bool, **args):
229236
"""
230237
Annotate all elements of a schema
231238
232239
Requires an OAK input ontology selector (--input)
233240
"""
241+
impl = get_implementation_from_shorthand(input)
234242
logging.basicConfig(level=logging.INFO)
235-
annr = SchemaAnnotator()
236-
schema = annr.annotate_schema(schema)
243+
annr = SchemaAnnotator(impl)
244+
schema = annr.annotate_schema(schema, curie_only=curie_only)
237245
write_schema(schema, output)
238246

239247

schema_automator/enhancer/__init__.py

Whitespace-only changes.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from dataclasses import dataclass
2+
3+
4+
@dataclass
class GeneralSchemaEnhancer:
    """
    Placeholder for schema enhancement functionality.

    The main functions have moved to core linkml,
    see https://github.com/linkml/linkml/pull/854

    This class is currently a stub kept for future enhancements.
    """
12+
13+
14+
15+
16+
17+

schema_automator/generalizers/csv_data_generalizer.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131

3232
@dataclass
3333
class ForeignKey:
34+
"""
35+
Represents a field in one table that points to an identifier field in another
36+
"""
3437
source_table: str
3538
source_column: str
3639
target_table: str
@@ -94,7 +97,7 @@ def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
9497
c = os.path.splitext(os.path.basename(file))[0]
9598
if self.downcase_header:
9699
c = c.lower()
97-
print(f'READING {file} ')
100+
logging.info(f'READING {file} ')
98101
df = pd.read_csv(file, sep=self.column_separator, skipinitialspace=True).fillna("")
99102
if self.downcase_header:
100103
df = df.rename(columns=str.lower)
@@ -187,6 +190,13 @@ def inject_foreign_keys(self, sv: SchemaView, fks: List[ForeignKey]) -> None:
187190
#tgt_slot['identifier'] = True
188191

189192
def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition:
193+
"""
194+
Converts multiple TSVs to a schema
195+
196+
:param files:
197+
:param kwargs:
198+
:return:
199+
"""
190200
if self.infer_foreign_keys:
191201
fks = self.infer_linkages(files)
192202
else:
@@ -199,16 +209,23 @@ def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition:
199209
s = self.convert(file, class_name=c, **kwargs)
200210
if s is not None:
201211
schemas.append(s)
202-
print(f'CLASSES={list(s.classes.keys())}')
212+
logging.info(f'Classes={list(s.classes.keys())}')
203213
sv = SchemaView(schemas[0])
204214
for s in schemas[1:]:
205215
sv.merge_schema(s)
206-
print(f'xxxCLASSES={list(sv.all_classes().keys())}')
216+
logging.info(f'Classes, post merge={list(sv.all_classes().keys())}')
207217
#s = merge_schemas(yamlobjs)
208218
self.inject_foreign_keys(sv, fks)
209219
return sv.schema
210220

211221
def convert(self, file: str, **kwargs) -> SchemaDefinition:
222+
"""
223+
Converts a single TSV file to a single-class schema
224+
225+
:param file:
226+
:param kwargs:
227+
:return:
228+
"""
212229
with open(file, newline='') as tsv_file:
213230
header = [h.strip() for h in tsv_file.readline().split('\t')]
214231
rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=self.column_separator, skipinitialspace=False)

schema_automator/generalizers/json_instance_generalizer.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ def frontmatter2model(inputs, format, omit_null, **kwargs):
140140
141141
142142
"""
143-
print(f'INPUTS={inputs}')
144143
ie = JsonDataGeneralizer(omit_null=omit_null)
145144
objs = parse_frontmatter_files(list(inputs))
146145
schema = ie.convert({'objects': objs}, dir=dir, format=format, **kwargs)

schema_automator/importers/dosdp_import_engine.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,12 @@
2828
@dataclass
2929
class DOSDPImportEngine(ImportEngine):
3030
"""
31-
For every template with name Foo, a LinkML class FooTemplate is created
31+
An ImportEngine that imports Ontology Design Patterns specified as DOSDP Yaml into a LinkML schema
3232
33-
The following builtin slots are created:
33+
See `DOSDPs <https://github.com/INCATools/dead_simple_owl_design_patterns>`_
34+
35+
Every template maps to a LinkML class; by default, a template named Foo becomes the class FooTemplate
3436
35-
- name
36-
- definition
37-
- subclass_of
38-
- equivalentTo
3937
"""
4038
mappings: dict = None
4139
include_unmapped_annotations = False
@@ -49,6 +47,14 @@ def load_dp(self, path) -> Pattern:
4947
return yaml_loader.load(obj, target_class=Pattern)
5048

5149
def convert(self, files: str, range_as_enums = True, **kwargs) -> SchemaDefinition:
50+
"""
51+
Converts one or more YAML files into a Schema
52+
53+
:param files:
54+
:param range_as_enums: if True, then class ranges are mapped to Enums
55+
:param kwargs:
56+
:return:
57+
"""
5258
patterns = [self.load_dp(file) for file in files]
5359
schema = SchemaDefinition(**kwargs)
5460
if not schema.default_prefix:

0 commit comments

Comments
 (0)