Skip to content

Commit

Permalink
draft schemas for table extraction (#501)
Browse files Browse the repository at this point in the history
  • Loading branch information
caufieldjh authored Jan 25, 2025
2 parents 3945c8d + 62da307 commit a0fb9ba
Show file tree
Hide file tree
Showing 6 changed files with 1,317 additions and 590 deletions.
1,187 changes: 598 additions & 589 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,4 @@ build-backend = "poetry_dynamic_versioning.backend"
skip = '.git,*.pdf,*.svg,poetry.lock,output,*.tsv,./tests/input,old'
# some specific phrases, variables and mixed case (CamelCase etc)
ignore-regex = '\b(Torsades de pointes|[A-Z][a-zA-Z]*|[a-z]+[A-Z][a-zA-Z]*|de pointes)\b|\bcommments:'
ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant,vrsatile'
ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant,vrsatile,anc,disjointness'
281 changes: 281 additions & 0 deletions src/ontogpt/templates/table_arrays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
from __future__ import annotations

import re
import sys
from datetime import (
date,
datetime,
time
)
from decimal import Decimal
from enum import Enum
from typing import (
Any,
ClassVar,
Dict,
List,
Literal,
Optional,
Union
)

from pydantic import (
BaseModel,
ConfigDict,
Field,
RootModel,
field_validator
)


# Version placeholders: both hold the literal string "None", not the None object.
metamodel_version = version = "None"


class ConfiguredBaseModel(BaseModel):
    # Common pydantic configuration; the model classes in this module derive
    # from this class so they all share the same settings.
    model_config = ConfigDict(
        validate_assignment=True,
        validate_default=True,
        extra="forbid",
        arbitrary_types_allowed=True,
        use_enum_values=True,
        strict=False,
    )




class LinkMLMeta(RootModel):
    # Frozen root model wrapping a plain ``dict`` of metadata. The dunder
    # methods below forward attribute, item and membership access to that dict.
    root: Dict[str, Any] = {}
    model_config = ConfigDict(frozen=True)

    def __getattr__(self, key: str):
        # Python calls this only after normal attribute lookup has failed;
        # the request is then answered by the wrapped dict.
        wrapped = self.root
        return getattr(wrapped, key)

    def __getitem__(self, key: str):
        wrapped = self.root
        return wrapped[key]

    def __setitem__(self, key: str, value):
        # Writes into the existing dict object; ``root`` itself is not rebound.
        wrapped = self.root
        wrapped[key] = value

    def __contains__(self, key: str) -> bool:
        wrapped = self.root
        return key in wrapped


# Schema-level metadata for this module, wrapped in a LinkMLMeta instance.
linkml_meta = LinkMLMeta(
    {
        "default_prefix": "STE",
        "description": "An OntoGPT schema for extracting simple numerical tables with a separate header row, label row, and data arrays",
        "id": "https://w3id.org/ontogpt/table_arrays",
        "imports": ["core"],
        "name": "SimpleTableExtraction",
        "source_file": "src/ontogpt/templates/table_arrays.yaml",
    }
)

class NullDataOptions(str, Enum):
    # String-valued enum: each member's value is identical to its name.
    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
    NOT_APPLICABLE = "NOT_APPLICABLE"
    NOT_MENTIONED = "NOT_MENTIONED"



class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "http://w3id.org/ontogpt/core"}
    )

    # Every field below is optional and defaults to None.
    input_id: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "input_id", "domain_of": ["ExtractionResult"]}
        },
    )
    input_title: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "input_title", "domain_of": ["ExtractionResult"]}
        },
    )
    input_text: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "input_text", "domain_of": ["ExtractionResult"]}
        },
    )
    raw_completion_output: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {
                "alias": "raw_completion_output",
                "domain_of": ["ExtractionResult"],
            }
        },
    )
    prompt: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "prompt", "domain_of": ["ExtractionResult"]}
        },
    )
    extracted_object: Optional[Any] = Field(
        default=None,
        description="""The complex objects extracted from the text""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "extracted_object",
                "domain_of": ["ExtractionResult"],
            }
        },
    )
    named_entities: Optional[List[Any]] = Field(
        default=None,
        description="""Named entities extracted from the text""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "named_entities",
                "domain_of": ["ExtractionResult"],
            }
        },
    )


class NamedEntity(ConfiguredBaseModel):
    # Marked abstract in its LinkML metadata. Carries a required ``id`` plus an
    # optional label and optional list of text spans.
    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"abstract": True, "from_schema": "http://w3id.org/ontogpt/core"}
    )

    id: str = Field(
        ...,
        description="""A unique identifier for the named entity""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "id",
                "annotations": {
                    "prompt.skip": {"tag": "prompt.skip", "value": "true"}
                },
                "comments": [
                    "this is populated during the grounding and normalization step"
                ],
                "domain_of": ["NamedEntity", "Publication"],
            }
        },
    )
    label: Optional[str] = Field(
        default=None,
        description="""The label (name) of the named thing""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "label",
                "aliases": ["name"],
                "annotations": {
                    "owl": {
                        "tag": "owl",
                        "value": "AnnotationProperty, AnnotationAssertion",
                    }
                },
                "domain_of": ["NamedEntity"],
                "slot_uri": "rdfs:label",
            }
        },
    )
    original_spans: Optional[List[str]] = Field(
        default=None,
        description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "original_spans",
                "annotations": {
                    "prompt.skip": {"tag": "prompt.skip", "value": "true"}
                },
                "comments": [
                    "This is determined during grounding and normalization",
                    "But is based on the full input text",
                ],
                "domain_of": ["NamedEntity"],
            }
        },
    )

    @field_validator("original_spans")
    @classmethod
    def pattern_original_spans(cls, v):
        """Check that each span is written as digits, a colon, then digits.

        Args:
            v: The field value: a list of span strings, a single string, or
                any other value (for example None), which is passed through.

        Returns:
            ``v`` unchanged when every string it contains matches the pattern.

        Raises:
            ValueError: If a string span does not match the pattern.
        """
        pattern = re.compile(r"^\d+:\d+$")
        if isinstance(v, list):
            for element in v:
                # Test the element, not the enclosing list. Checking ``v`` here
                # was always false inside this branch, so bad spans slipped through.
                if isinstance(element, str) and not pattern.match(element):
                    raise ValueError(f"Invalid original_spans format: {element}")
        elif isinstance(v, str):
            if not pattern.match(v):
                raise ValueError(f"Invalid original_spans format: {v}")
        return v


class CompoundExpression(ConfiguredBaseModel):
    # Marked abstract in its LinkML metadata; declares no fields of its own.
    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"abstract": True, "from_schema": "http://w3id.org/ontogpt/core"}
    )


class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"abstract": True, "from_schema": "http://w3id.org/ontogpt/core"}
    )

    # Every field below is an optional string defaulting to None.
    subject: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "subject", "domain_of": ["Triple"]}
        },
    )
    predicate: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "predicate", "domain_of": ["Triple"]}
        },
    )
    object: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "object", "domain_of": ["Triple"]}
        },
    )
    qualifier: Optional[str] = Field(
        default=None,
        description="""A qualifier for the statements, e.g. \"NOT\" for negation""",
        json_schema_extra={
            "linkml_meta": {"alias": "qualifier", "domain_of": ["Triple"]}
        },
    )
    subject_qualifier: Optional[str] = Field(
        default=None,
        description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""",
        json_schema_extra={
            "linkml_meta": {"alias": "subject_qualifier", "domain_of": ["Triple"]}
        },
    )
    object_qualifier: Optional[str] = Field(
        default=None,
        description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""",
        json_schema_extra={
            "linkml_meta": {"alias": "object_qualifier", "domain_of": ["Triple"]}
        },
    )


class TextWithTriples(ConfiguredBaseModel):
    """
    A text containing one or more relations of the Triple type.
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "http://w3id.org/ontogpt/core"}
    )

    # ``Publication`` is declared later in the module, so this annotation is a
    # forward reference.
    publication: Optional[Publication] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {
                "alias": "publication",
                "annotations": {
                    "prompt.skip": {"tag": "prompt.skip", "value": "true"}
                },
                "domain_of": ["TextWithTriples", "TextWithEntity"],
            }
        },
    )
    triples: Optional[List[Triple]] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "triples", "domain_of": ["TextWithTriples"]}
        },
    )


class TextWithEntity(ConfiguredBaseModel):
    """
    A text containing one or more instances of a single type of entity.
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "http://w3id.org/ontogpt/core"}
    )

    # ``Publication`` is declared later in the module, so this annotation is a
    # forward reference.
    publication: Optional[Publication] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {
                "alias": "publication",
                "annotations": {
                    "prompt.skip": {"tag": "prompt.skip", "value": "true"}
                },
                "domain_of": ["TextWithTriples", "TextWithEntity"],
            }
        },
    )
    entities: Optional[List[str]] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "entities", "domain_of": ["TextWithEntity"]}
        },
    )


class RelationshipType(NamedEntity):
    # Subclass of NamedEntity that redeclares the three inherited fields and
    # the span validator; its metadata lists the RO and biolink id prefixes.
    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {
            "from_schema": "http://w3id.org/ontogpt/core",
            "id_prefixes": ["RO", "biolink"],
        }
    )

    id: str = Field(
        ...,
        description="""A unique identifier for the named entity""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "id",
                "annotations": {
                    "prompt.skip": {"tag": "prompt.skip", "value": "true"}
                },
                "comments": [
                    "this is populated during the grounding and normalization step"
                ],
                "domain_of": ["NamedEntity", "Publication"],
            }
        },
    )
    label: Optional[str] = Field(
        default=None,
        description="""The label (name) of the named thing""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "label",
                "aliases": ["name"],
                "annotations": {
                    "owl": {
                        "tag": "owl",
                        "value": "AnnotationProperty, AnnotationAssertion",
                    }
                },
                "domain_of": ["NamedEntity"],
                "slot_uri": "rdfs:label",
            }
        },
    )
    original_spans: Optional[List[str]] = Field(
        default=None,
        description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "original_spans",
                "annotations": {
                    "prompt.skip": {"tag": "prompt.skip", "value": "true"}
                },
                "comments": [
                    "This is determined during grounding and normalization",
                    "But is based on the full input text",
                ],
                "domain_of": ["NamedEntity"],
            }
        },
    )

    @field_validator("original_spans")
    @classmethod
    def pattern_original_spans(cls, v):
        """Check that each span is written as digits, a colon, then digits.

        Args:
            v: The field value: a list of span strings, a single string, or
                any other value (for example None), which is passed through.

        Returns:
            ``v`` unchanged when every string it contains matches the pattern.

        Raises:
            ValueError: If a string span does not match the pattern.
        """
        pattern = re.compile(r"^\d+:\d+$")
        if isinstance(v, list):
            for element in v:
                # Test the element, not the enclosing list. Checking ``v`` here
                # was always false inside this branch, so bad spans slipped through.
                if isinstance(element, str) and not pattern.match(element):
                    raise ValueError(f"Invalid original_spans format: {element}")
        elif isinstance(v, str):
            if not pattern.match(v):
                raise ValueError(f"Invalid original_spans format: {v}")
        return v


class Publication(ConfiguredBaseModel):
    # Every field below is an optional string defaulting to None.
    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "http://w3id.org/ontogpt/core"}
    )

    id: Optional[str] = Field(
        default=None,
        description="""The publication identifier""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "id",
                "domain_of": ["NamedEntity", "Publication"],
            }
        },
    )
    title: Optional[str] = Field(
        default=None,
        description="""The title of the publication""",
        json_schema_extra={
            "linkml_meta": {"alias": "title", "domain_of": ["Publication"]}
        },
    )
    abstract: Optional[str] = Field(
        default=None,
        description="""The abstract of the publication""",
        json_schema_extra={
            "linkml_meta": {"alias": "abstract", "domain_of": ["Publication"]}
        },
    )
    combined_text: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "combined_text", "domain_of": ["Publication"]}
        },
    )
    full_text: Optional[str] = Field(
        default=None,
        description="""The full text of the publication""",
        json_schema_extra={
            "linkml_meta": {"alias": "full_text", "domain_of": ["Publication"]}
        },
    )


class AnnotatorResult(ConfiguredBaseModel):
    # Three optional string fields, each defaulting to None.
    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "http://w3id.org/ontogpt/core"}
    )

    subject_text: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "subject_text", "domain_of": ["AnnotatorResult"]}
        },
    )
    object_id: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "object_id", "domain_of": ["AnnotatorResult"]}
        },
    )
    object_text: Optional[str] = Field(
        default=None,
        json_schema_extra={
            "linkml_meta": {"alias": "object_text", "domain_of": ["AnnotatorResult"]}
        },
    )


class Table(ConfiguredBaseModel):
    """
    “A table extracted from a publication, containing rows and columns in array form.”
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "https://w3id.org/ontogpt/table_arrays"}
    )

    # Plain string attributes of the table.
    table_id: Optional[str] = Field(
        default=None,
        description="""“Identifier for the table, e.g. ‘Table 1’, ‘Table 2’, etc.”""",
        json_schema_extra={
            "linkml_meta": {"alias": "table_id", "domain_of": ["Table"]}
        },
    )
    source_document: Optional[str] = Field(
        default=None,
        description="""“Identifier or reference to the PDF or publication source.”""",
        json_schema_extra={
            "linkml_meta": {"alias": "source_document", "domain_of": ["Table"]}
        },
    )
    caption: Optional[str] = Field(
        default=None,
        description="""“Text of the table caption or title.”""",
        json_schema_extra={
            "linkml_meta": {"alias": "caption", "domain_of": ["Table"]}
        },
    )

    # Nested row models; these classes are declared later in the module, so the
    # annotations are forward references.
    header_row: Optional[TableHeaderRow] = Field(
        default=None,
        description="""“The row containing the column headers.”""",
        json_schema_extra={
            "linkml_meta": {"alias": "header_row", "domain_of": ["Table"]}
        },
    )
    label_row: Optional[TableLabelRow] = Field(
        default=None,
        description="""“The row containing row labels.”""",
        json_schema_extra={
            "linkml_meta": {"alias": "label_row", "domain_of": ["Table"]}
        },
    )
    data_rows: Optional[List[TableDataRow]] = Field(
        default=None,
        description="""“A list of data rows, each representing a row in the table body.”""",
        json_schema_extra={
            "linkml_meta": {"alias": "data_rows", "domain_of": ["Table"]}
        },
    )


class TableHeaderRow(ConfiguredBaseModel):
    """
    “A single row containing all column headers.”
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "https://w3id.org/ontogpt/table_arrays"}
    )

    # Optional list of strings; defaults to None rather than an empty list.
    header_values: Optional[List[str]] = Field(
        default=None,
        description="""“An array of strings corresponding to each column header.”""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "header_values",
                "domain_of": ["TableHeaderRow"],
            }
        },
    )


class TableLabelRow(ConfiguredBaseModel):
    """
    “A single row containing the labels for each of the data rows.”
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "https://w3id.org/ontogpt/table_arrays"}
    )

    # Optional list of strings; defaults to None rather than an empty list.
    label_values: Optional[List[str]] = Field(
        default=None,
        description="""“An array of strings corresponding to each row label, aligning with data_rows.”""",
        json_schema_extra={
            "linkml_meta": {
                "alias": "label_values",
                "domain_of": ["TableLabelRow"],
            }
        },
    )


class TableDataRow(ConfiguredBaseModel):
    """
    “A row of data in the table body, indexed and containing an array of numeric values.”
    """

    linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta(
        {"from_schema": "https://w3id.org/ontogpt/table_arrays"}
    )

    # Every field below is optional and defaults to None.
    row_index: Optional[int] = Field(
        default=None,
        description="""“Numeric index of this row (starting from 0 or 1).”""",
        json_schema_extra={
            "linkml_meta": {"alias": "row_index", "domain_of": ["TableDataRow"]}
        },
    )
    values: Optional[List[float]] = Field(
        default=None,
        description="""“An array of numeric values in this data row.”""",
        json_schema_extra={
            "linkml_meta": {"alias": "values", "domain_of": ["TableDataRow"]}
        },
    )
    note: Optional[str] = Field(
        default=None,
        description="""“Optional note or comment about this particular row.”""",
        json_schema_extra={
            "linkml_meta": {"alias": "note", "domain_of": ["TableDataRow"]}
        },
    )


# Model rebuild
# see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model
# Some field annotations name classes that are declared further down the
# module (e.g. Publication, TableHeaderRow), so every model is rebuilt here,
# after all classes exist.
for _model_cls in (
    ExtractionResult,
    NamedEntity,
    CompoundExpression,
    Triple,
    TextWithTriples,
    TextWithEntity,
    RelationshipType,
    Publication,
    AnnotatorResult,
    Table,
    TableHeaderRow,
    TableLabelRow,
    TableDataRow,
):
    _model_cls.model_rebuild()
del _model_cls  # keep the loop variable out of the module namespace

69 changes: 69 additions & 0 deletions src/ontogpt/templates/table_arrays.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Schema header: identifier, name, description and default prefix.
id: https://w3id.org/ontogpt/table_arrays
name: SimpleTableExtraction
description: An OntoGPT schema for extracting simple numerical tables with a separate header row, label row, and data arrays
default_prefix: STE
# Imports the schema named "core".
imports:
- core
# Class definitions; each class lists the slots it uses.
# NOTE(review): the description values below are wrapped in typographic quotes (“ ”).
# YAML does not treat those as string delimiters, so they are part of each value — confirm this is intended.
classes:
Table:
description: “A table extracted from a publication, containing rows and columns in array form.”
slots:
- table_id
- source_document
- caption
- header_row
- label_row
- data_rows
TableHeaderRow:
description: “A single row containing all column headers.”
slots:
- header_values
TableLabelRow:
description: “A single row containing the labels for each of the data rows.”
slots:
- label_values
TableDataRow:
description: “A row of data in the table body, indexed and containing an array of numeric values.”
slots:
- row_index
- values
- note
# Slot definitions: each gives a description and a range, plus "multivalued: true" for list-valued slots.
slots:
table_id:
description: “Identifier for the table, e.g. ‘Table 1’, ‘Table 2’, etc.”
range: string
source_document:
description: “Identifier or reference to the PDF or publication source.”
range: string
caption:
description: “Text of the table caption or title.”
range: string
# The next three slots have another class of this schema as their range.
header_row:
description: “The row containing the column headers.”
range: TableHeaderRow
label_row:
description: “The row containing row labels.”
range: TableLabelRow
data_rows:
description: “A list of data rows, each representing a row in the table body.”
range: TableDataRow
multivalued: true
header_values:
description: “An array of strings corresponding to each column header.”
range: string
multivalued: true
label_values:
description: “An array of strings corresponding to each row label, aligning with data_rows.”
range: string
multivalued: true
row_index:
description: “Numeric index of this row (starting from 0 or 1).”
range: integer
values:
description: “An array of numeric values in this data row.”
range: float
multivalued: true
note:
description: “Optional note or comment about this particular row.”
range: string
# templates: {}
Loading

0 comments on commit a0fb9ba

Please sign in to comment.