draft schemas for table extraction (#501)

monarch-initiative · Jan 25, 2025 · a0fb9ba · a0fb9ba
2 parents 3945c8d + 62da307
commit a0fb9ba
Show file tree

Hide file tree

Showing 6 changed files with 1,317 additions and 590 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -109,4 +109,4 @@ build-backend = "poetry_dynamic_versioning.backend"
 skip = '.git,*.pdf,*.svg,poetry.lock,output,*.tsv,./tests/input,old'
 # some specific phrases, variables and mixed case (CamelCase etc)
 ignore-regex = '\b(Torsades de pointes|[A-Z][a-zA-Z]*|[a-z]+[A-Z][a-zA-Z]*|de pointes)\b|\bcommments:'
-ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant,vrsatile'
+ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant,vrsatile,anc,disjointness'
diff --git a/src/ontogpt/templates/table_arrays.py b/src/ontogpt/templates/table_arrays.py
@@ -0,0 +1,281 @@
+from __future__ import annotations 
+
+import re
+import sys
+from datetime import (
+    date,
+    datetime,
+    time
+)
+from decimal import Decimal 
+from enum import Enum 
+from typing import (
+    Any,
+    ClassVar,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Union
+)
+
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    RootModel,
+    field_validator
+)
+
+
+# Version markers for this module. Both hold the literal string "None"
+# (a str, not the None object).
+metamodel_version = "None"
+version = "None"
+
+
+class ConfiguredBaseModel(BaseModel):
+    model_config = ConfigDict(
+        validate_assignment = True,
+        validate_default = True,
+        extra = "forbid",
+        arbitrary_types_allowed = True,
+        use_enum_values = True,
+        strict = False,
+    )
+    pass
+
+
+
+
+class LinkMLMeta(RootModel):
+    # Thin wrapper around a dict of metadata; the dict itself lives in ``root``.
+    root: Dict[str, Any] = {}
+    # The model is declared frozen. Note that __setitem__ below still writes
+    # into the dict held by ``root`` (it mutates the dict, it does not rebind
+    # the attribute).
+    model_config = ConfigDict(frozen=True)
+
+    def __getattr__(self, key:str):
+        # Called only when normal attribute lookup fails; forwards the lookup
+        # to the wrapped dict, so dict methods are reachable on the wrapper.
+        return getattr(self.root, key)
+
+    def __getitem__(self, key:str):
+        # Dict-style read access: meta[key]. Raises KeyError for a missing key.
+        return self.root[key]
+
+    def __setitem__(self, key:str, value):
+        # Dict-style write access: meta[key] = value.
+        self.root[key] = value
+
+    def __contains__(self, key:str) -> bool:
+        # Supports ``key in meta``.
+        return key in self.root
+
+
+# Schema-level metadata for this module: default prefix, description, id,
+# imports, schema name and the path of the YAML file recorded as its source.
+linkml_meta = LinkMLMeta({'default_prefix': 'STE',
+     'description': 'An OntoGPT schema for extracting simple numerical tables with '
+                    'a separate header row, label row, and data arrays',
+     'id': 'https://w3id.org/ontogpt/table_arrays',
+     'imports': ['core'],
+     'name': 'SimpleTableExtraction',
+     'source_file': 'src/ontogpt/templates/table_arrays.yaml'} )
+
+class NullDataOptions(str, Enum):
+    # String-valued enum; each member's value is identical to its name.
+    # No field of the models defined in this module uses this enum as its type.
+    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
+    NOT_APPLICABLE = "NOT_APPLICABLE"
+    NOT_MENTIONED = "NOT_MENTIONED"
+
+
+
+class ExtractionResult(ConfiguredBaseModel):
+    """
+    A result of extracting knowledge on text
+    """
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    # Every field below is optional and defaults to None.
+    # The input_* fields, raw_completion_output and prompt are plain strings.
+    input_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_id', 'domain_of': ['ExtractionResult']} })
+    input_title: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_title', 'domain_of': ['ExtractionResult']} })
+    input_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'input_text', 'domain_of': ['ExtractionResult']} })
+    raw_completion_output: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'raw_completion_output', 'domain_of': ['ExtractionResult']} })
+    prompt: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'prompt', 'domain_of': ['ExtractionResult']} })
+    # Both of the following are typed with Any, so this class places no
+    # constraint on the shape of the extracted content.
+    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'extracted_object', 'domain_of': ['ExtractionResult']} })
+    named_entities: Optional[List[Any]] = Field(None, description="""Named entities extracted from the text""", json_schema_extra = { "linkml_meta": {'alias': 'named_entities', 'domain_of': ['ExtractionResult']} })
+
+
+class NamedEntity(ConfiguredBaseModel):
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'comments': ['this is populated during the grounding and normalization step'],
+         'domain_of': ['NamedEntity', 'Publication']} })
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label',
+         'aliases': ['name'],
+         'annotations': {'owl': {'tag': 'owl',
+                                 'value': 'AnnotationProperty, AnnotationAssertion'}},
+         'domain_of': ['NamedEntity'],
+         'slot_uri': 'rdfs:label'} })
+    original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'comments': ['This is determined during grounding and normalization',
+                      'But is based on the full input text'],
+         'domain_of': ['NamedEntity']} })
+
+    @field_validator('original_spans')
+    def pattern_original_spans(cls, v):
+        pattern=re.compile(r"^\d+:\d+$")
+        if isinstance(v,list):
+            for element in v:
+                if isinstance(v, str) and not pattern.match(element):
+                    raise ValueError(f"Invalid original_spans format: {element}")
+        elif isinstance(v,str):
+            if not pattern.match(v):
+                raise ValueError(f"Invalid original_spans format: {v}")
+        return v
+
+
+class CompoundExpression(ConfiguredBaseModel):
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    pass
+
+
+class Triple(CompoundExpression):
+    """
+    Abstract parent for Relation Extraction tasks
+    """
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'abstract': True, 'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    # All six fields are optional strings defaulting to None.
+    # subject / predicate / object carry the three parts of the statement;
+    # the field name ``object`` follows the alias recorded in its metadata.
+    subject: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject', 'domain_of': ['Triple']} })
+    predicate: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'predicate', 'domain_of': ['Triple']} })
+    object: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object', 'domain_of': ['Triple']} })
+    # Qualifiers: one for the statement as a whole, one each for subject and object.
+    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""", json_schema_extra = { "linkml_meta": {'alias': 'qualifier', 'domain_of': ['Triple']} })
+    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""", json_schema_extra = { "linkml_meta": {'alias': 'subject_qualifier', 'domain_of': ['Triple']} })
+    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""", json_schema_extra = { "linkml_meta": {'alias': 'object_qualifier', 'domain_of': ['Triple']} })
+
+
+class TextWithTriples(ConfiguredBaseModel):
+    """
+    A text containing one or more relations of the Triple type.
+    """
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    # ``Publication`` is defined further down in this module. The annotation is
+    # usable here because annotations are postponed (``from __future__ import
+    # annotations``) and the model is rebuilt at the end of the module.
+    publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'domain_of': ['TextWithTriples', 'TextWithEntity']} })
+    # Optional list of Triple objects; defaults to None rather than an empty list.
+    triples: Optional[List[Triple]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'triples', 'domain_of': ['TextWithTriples']} })
+
+
+class TextWithEntity(ConfiguredBaseModel):
+    """
+    A text containing one or more instances of a single type of entity.
+    """
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    # ``Publication`` is defined further down in this module. The annotation is
+    # usable here because annotations are postponed (``from __future__ import
+    # annotations``) and the model is rebuilt at the end of the module.
+    publication: Optional[Publication] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'publication',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'domain_of': ['TextWithTriples', 'TextWithEntity']} })
+    # Entities are held as plain strings, not as NamedEntity objects.
+    entities: Optional[List[str]] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'entities', 'domain_of': ['TextWithEntity']} })
+
+
+class RelationshipType(NamedEntity):
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core',
+         'id_prefixes': ['RO', 'biolink']})
+
+    id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'comments': ['this is populated during the grounding and normalization step'],
+         'domain_of': ['NamedEntity', 'Publication']} })
+    label: Optional[str] = Field(None, description="""The label (name) of the named thing""", json_schema_extra = { "linkml_meta": {'alias': 'label',
+         'aliases': ['name'],
+         'annotations': {'owl': {'tag': 'owl',
+                                 'value': 'AnnotationProperty, AnnotationAssertion'}},
+         'domain_of': ['NamedEntity'],
+         'slot_uri': 'rdfs:label'} })
+    original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
+         'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
+         'comments': ['This is determined during grounding and normalization',
+                      'But is based on the full input text'],
+         'domain_of': ['NamedEntity']} })
+
+    @field_validator('original_spans')
+    def pattern_original_spans(cls, v):
+        pattern=re.compile(r"^\d+:\d+$")
+        if isinstance(v,list):
+            for element in v:
+                if isinstance(v, str) and not pattern.match(element):
+                    raise ValueError(f"Invalid original_spans format: {element}")
+        elif isinstance(v,str):
+            if not pattern.match(v):
+                raise ValueError(f"Invalid original_spans format: {v}")
+        return v
+
+
+class Publication(ConfiguredBaseModel):
+    # Publication record referenced by TextWithTriples and TextWithEntity.
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    # Every field is an optional string defaulting to None. Note that ``id`` is
+    # optional here, whereas NamedEntity declares its ``id`` as required.
+    id: Optional[str] = Field(None, description="""The publication identifier""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'domain_of': ['NamedEntity', 'Publication']} })
+    title: Optional[str] = Field(None, description="""The title of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'title', 'domain_of': ['Publication']} })
+    abstract: Optional[str] = Field(None, description="""The abstract of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'abstract', 'domain_of': ['Publication']} })
+    # No description is declared for combined_text.
+    # NOTE(review): presumably title and abstract joined together; confirm against the core schema.
+    combined_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'combined_text', 'domain_of': ['Publication']} })
+    full_text: Optional[str] = Field(None, description="""The full text of the publication""", json_schema_extra = { "linkml_meta": {'alias': 'full_text', 'domain_of': ['Publication']} })
+
+
+class AnnotatorResult(ConfiguredBaseModel):
+    # Three optional string fields, all defaulting to None; none of them
+    # declares a description.
+    # NOTE(review): looks like a text-to-identifier annotation record; confirm against the core schema.
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/core'})
+
+    subject_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'subject_text', 'domain_of': ['AnnotatorResult']} })
+    object_id: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_id', 'domain_of': ['AnnotatorResult']} })
+    object_text: Optional[str] = Field(None, json_schema_extra = { "linkml_meta": {'alias': 'object_text', 'domain_of': ['AnnotatorResult']} })
+
+
+class Table(ConfiguredBaseModel):
+    """
+    “A table extracted from a publication, containing rows and columns in array form.”
+    """
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'https://w3id.org/ontogpt/table_arrays'})
+
+    # Descriptive fields: all optional strings defaulting to None.
+    table_id: Optional[str] = Field(None, description="""“Identifier for the table, e.g. ‘Table 1’, ‘Table 2’, etc.”""", json_schema_extra = { "linkml_meta": {'alias': 'table_id', 'domain_of': ['Table']} })
+    source_document: Optional[str] = Field(None, description="""“Identifier or reference to the PDF or publication source.”""", json_schema_extra = { "linkml_meta": {'alias': 'source_document', 'domain_of': ['Table']} })
+    caption: Optional[str] = Field(None, description="""“Text of the table caption or title.”""", json_schema_extra = { "linkml_meta": {'alias': 'caption', 'domain_of': ['Table']} })
+    # Structural fields. TableHeaderRow, TableLabelRow and TableDataRow are
+    # defined after this class; the annotations are usable here because
+    # annotations are postponed and the model is rebuilt at the end of the module.
+    # Nothing in this class checks that the header, label and data arrays have
+    # matching lengths.
+    header_row: Optional[TableHeaderRow] = Field(None, description="""“The row containing the column headers.”""", json_schema_extra = { "linkml_meta": {'alias': 'header_row', 'domain_of': ['Table']} })
+    label_row: Optional[TableLabelRow] = Field(None, description="""“The row containing row labels.”""", json_schema_extra = { "linkml_meta": {'alias': 'label_row', 'domain_of': ['Table']} })
+    data_rows: Optional[List[TableDataRow]] = Field(None, description="""“A list of data rows, each representing a row in the table body.”""", json_schema_extra = { "linkml_meta": {'alias': 'data_rows', 'domain_of': ['Table']} })
+
+
+class TableHeaderRow(ConfiguredBaseModel):
+    """
+    “A single row containing all column headers.”
+    """
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'https://w3id.org/ontogpt/table_arrays'})
+
+    # Optional list of header strings; defaults to None rather than an empty list.
+    header_values: Optional[List[str]] = Field(None, description="""“An array of strings corresponding to each column header.”""", json_schema_extra = { "linkml_meta": {'alias': 'header_values', 'domain_of': ['TableHeaderRow']} })
+
+
+class TableLabelRow(ConfiguredBaseModel):
+    """
+    “A single row containing the labels for each of the data rows.”
+    """
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'https://w3id.org/ontogpt/table_arrays'})
+
+    # Optional list of row-label strings; defaults to None. The description
+    # says the labels align with data_rows, but no code here enforces that.
+    label_values: Optional[List[str]] = Field(None, description="""“An array of strings corresponding to each row label, aligning with data_rows.”""", json_schema_extra = { "linkml_meta": {'alias': 'label_values', 'domain_of': ['TableLabelRow']} })
+
+
+class TableDataRow(ConfiguredBaseModel):
+    """
+    “A row of data in the table body, indexed and containing an array of numeric values.”
+    """
+    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'https://w3id.org/ontogpt/table_arrays'})
+
+    # Optional integer index. The description leaves the base (0 or 1) open,
+    # and nothing here validates it.
+    row_index: Optional[int] = Field(None, description="""“Numeric index of this row (starting from 0 or 1).”""", json_schema_extra = { "linkml_meta": {'alias': 'row_index', 'domain_of': ['TableDataRow']} })
+    # Cell values are typed as floats only.
+    # NOTE(review): non-numeric cells (e.g. "NA") presumably fail validation; confirm this is intended.
+    values: Optional[List[float]] = Field(None, description="""“An array of numeric values in this data row.”""", json_schema_extra = { "linkml_meta": {'alias': 'values', 'domain_of': ['TableDataRow']} })
+    note: Optional[str] = Field(None, description="""“Optional note or comment about this particular row.”""", json_schema_extra = { "linkml_meta": {'alias': 'note', 'domain_of': ['TableDataRow']} })
+
+
+# Model rebuild
+# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
+# Several annotations above name classes that are defined later in the module
+# (e.g. Publication, TableHeaderRow). model_rebuild() is called on every model
+# here, once all of the classes exist.
+ExtractionResult.model_rebuild()
+NamedEntity.model_rebuild()
+CompoundExpression.model_rebuild()
+Triple.model_rebuild()
+TextWithTriples.model_rebuild()
+TextWithEntity.model_rebuild()
+RelationshipType.model_rebuild()
+Publication.model_rebuild()
+AnnotatorResult.model_rebuild()
+Table.model_rebuild()
+TableHeaderRow.model_rebuild()
+TableLabelRow.model_rebuild()
+TableDataRow.model_rebuild()
+
diff --git a/src/ontogpt/templates/table_arrays.yaml b/src/ontogpt/templates/table_arrays.yaml
@@ -0,0 +1,69 @@
+# LinkML schema for extracting simple numeric tables: one header row, one set
+# of row labels, and a list of numeric data rows.
+# Descriptions use real YAML double quotes. Typographic quotes are not YAML
+# quoting, so they would become part of each description value.
+# table_arrays.py records this file as its source_file; regenerate it after
+# editing this schema.
+id: https://w3id.org/ontogpt/table_arrays
+name: SimpleTableExtraction
+description: An OntoGPT schema for extracting simple numerical tables with a separate header row, label row, and data arrays
+default_prefix: STE
+imports:
+  - core
+classes:
+  Table:
+    description: "A table extracted from a publication, containing rows and columns in array form."
+    slots:
+      - table_id
+      - source_document
+      - caption
+      - header_row
+      - label_row
+      - data_rows
+  TableHeaderRow:
+    description: "A single row containing all column headers."
+    slots:
+      - header_values
+  TableLabelRow:
+    description: "A single row containing the labels for each of the data rows."
+    slots:
+      - label_values
+  TableDataRow:
+    description: "A row of data in the table body, indexed and containing an array of numeric values."
+    slots:
+      - row_index
+      - values
+      - note
+slots:
+  table_id:
+    description: "Identifier for the table, e.g. 'Table 1', 'Table 2', etc."
+    range: string
+  source_document:
+    description: "Identifier or reference to the PDF or publication source."
+    range: string
+  caption:
+    description: "Text of the table caption or title."
+    range: string
+  header_row:
+    description: "The row containing the column headers."
+    range: TableHeaderRow
+  label_row:
+    description: "The row containing row labels."
+    range: TableLabelRow
+  data_rows:
+    description: "A list of data rows, each representing a row in the table body."
+    range: TableDataRow
+    multivalued: true
+  header_values:
+    description: "An array of strings corresponding to each column header."
+    range: string
+    multivalued: true
+  label_values:
+    description: "An array of strings corresponding to each row label, aligning with data_rows."
+    range: string
+    multivalued: true
+  row_index:
+    description: "Numeric index of this row (starting from 0 or 1)."
+    range: integer
+  # Only numeric cells can be represented by this slot.
+  values:
+    description: "An array of numeric values in this data row."
+    range: float
+    multivalued: true
+  note:
+    description: "Optional note or comment about this particular row."
+    range: string
+# templates: {}