From e9a59443614ffe0ef394c67f99a79949f2f380ed Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Jan 2025 11:26:57 +0100 Subject: [PATCH 1/7] Add ClinicalTrials.gov source --- src/pyobo/sources/__init__.py | 2 + src/pyobo/sources/clinicaltrials.py | 80 +++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 src/pyobo/sources/clinicaltrials.py diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py index 2d726a28..de596521 100644 --- a/src/pyobo/sources/__init__.py +++ b/src/pyobo/sources/__init__.py @@ -8,6 +8,7 @@ from .cgnc import CGNCGetter from .chembl import ChEMBLCompoundGetter from .civic_gene import CIVICGeneGetter +from .clinicaltrials import ClinicalTrialsGetter from .complexportal import ComplexPortalGetter from .conso import CONSOGetter from .cpt import CPTGetter @@ -70,6 +71,7 @@ "CPTGetter", "CVXGetter", "ChEMBLCompoundGetter", + "ClinicalTrialsGetter", "ComplexPortalGetter", "CreditGetter", "DepMapGetter", diff --git a/src/pyobo/sources/clinicaltrials.py b/src/pyobo/sources/clinicaltrials.py new file mode 100644 index 00000000..d1034a5e --- /dev/null +++ b/src/pyobo/sources/clinicaltrials.py @@ -0,0 +1,80 @@ +"""A source for ClinicalTrials.gov.""" + +from collections.abc import Iterable + +from clinicaltrials_downloader import get_studies + +from pyobo import Obo, Reference, Term, default_reference +from pyobo.struct.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED +from pyobo.struct.typedef import has_category, has_contributor + +__all__ = [ + "ClinicalTrialsGetter", +] + +PREFIX = "clinicaltrials" +DEFAULT_FIELDS = [ + "NCTId", + "BriefTitle", + "Condition", + "ConditionMeshTerm", # ConditionMeshTerm is the name of the disease + "ConditionMeshId", + "InterventionName", # InterventionName is the name of the drug/vaccine + "InterventionType", + "InterventionMeshTerm", + "InterventionMeshId", + "StudyType", + "DesignAllocation", + "OverallStatus", + "Phase", + "WhyStopped", + "SecondaryIdType", + "SecondaryId", + "StartDate", # Month [day], year: "November 1, 2023", "May 1984" or NaN + "StartDateType", # "Actual" or "Anticipated" (or NaN) + "ReferencePMID", # these are tagged as relevant by the author, but not necessarily about the trial +] + +CLINICAL_TRIAL_TERM = ( + Term(reference=default_reference(PREFIX, "clinical-trial", name="clinical trial")) + .annotate_object(has_contributor, CHARLIE_TERM) + .append_comment(PYOBO_INJECTED) + .append_see_also_uri("https://github.com/obi-ontology/obi/issues/1831#issuecomment-2587810590") +) + + +class ClinicalTrialsGetter(Obo): + """Get the ClinicalTrials.gov database as an ontology.""" + + ontology = PREFIX + dynamic_version = True + typedefs = [has_contributor, has_category] + + def iter_terms(self, force: bool = False) -> Iterable[Term]: + """Iterate over terms for studies.""" + yield CLINICAL_TRIAL_TERM + yield CHARLIE_TERM + yield HUMAN_TERM + yield from iterate_studies() + + +def iterate_studies(*, force: bool = False) -> Iterable[Term]: + """Iterate over terms for studies.""" + for study in get_studies(force=force): + yield _process_study(study) + + +def _process_study(raw_study) -> Term: + protocol_section = raw_study["protocolSection"] + identification_module = protocol_section["identificationModule"] + identifier = identification_module["nctId"] + name = identification_module["officialTitle"] + synonym = identification_module["briefTitle"] + design_module = protocol_section["design_module"] + study_type = design_module["studyType"] + term = Term(reference=Reference(prefix=PREFIX, identifier=identifier, name=name)) + term.append_synonym(synonym) + # TODO make the study type into inheritance + term.annotate_literal(has_category, study_type) + term.append_parent(CLINICAL_TRIAL_TERM) + return term From dbb5dec54f43348d94475753b2554e19cfba6332 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Jan 2025 11:41:36 +0100 Subject: [PATCH 2/7] Update clinicaltrials.py --- src/pyobo/sources/clinicaltrials.py | 59 ++++++++++++++--------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/pyobo/sources/clinicaltrials.py b/src/pyobo/sources/clinicaltrials.py index d1034a5e..d8f646ac 100644 --- a/src/pyobo/sources/clinicaltrials.py +++ b/src/pyobo/sources/clinicaltrials.py @@ -13,27 +13,6 @@ ] PREFIX = "clinicaltrials" -DEFAULT_FIELDS = [ - "NCTId", - "BriefTitle", - "Condition", - "ConditionMeshTerm", # ConditionMeshTerm is the name of the disease - "ConditionMeshId", - "InterventionName", # InterventionName is the name of the drug/vaccine - "InterventionType", - "InterventionMeshTerm", - "InterventionMeshId", - "StudyType", - "DesignAllocation", - "OverallStatus", - "Phase", - "WhyStopped", - "SecondaryIdType", - "SecondaryId", - "StartDate", # Month [day], year: "November 1, 2023", "May 1984" or NaN - "StartDateType", # "Actual" or "Anticipated" (or NaN) - "ReferencePMID", # these are tagged as relevant by the author, but not necessarily about the trial -] CLINICAL_TRIAL_TERM = ( Term(reference=default_reference(PREFIX, "clinical-trial", name="clinical trial")) @@ -60,21 +39,41 @@ def iter_terms(self, force: bool = False) -> Iterable[Term]: def iterate_studies(*, force: bool = False) -> Iterable[Term]: """Iterate over terms for studies.""" - for study in get_studies(force=force): + studies = get_studies(force=force) + for study in studies: yield _process_study(study) + +ENCOUNTERED_STUDY_TYPES = set() + + def _process_study(raw_study) -> Term: protocol_section = raw_study["protocolSection"] identification_module = protocol_section["identificationModule"] identifier = identification_module["nctId"] - name = identification_module["officialTitle"] - synonym = identification_module["briefTitle"] - design_module = protocol_section["design_module"] - study_type = design_module["studyType"] - term = Term(reference=Reference(prefix=PREFIX, identifier=identifier, name=name)) - term.append_synonym(synonym) - # TODO make the study type into inheritance - term.annotate_literal(has_category, study_type) + + name = identification_module.get("officialTitle") + synonym = identification_module.get("briefTitle") + if synonym and not name: + name, synonym = synonym, None + + term = Term( + reference=Reference(prefix=PREFIX, identifier=identifier, name=name), type="Instance" + ) + if synonym: + term.append_synonym(synonym) + + # TODO make the study type into inheritance, when available + design_module = protocol_section.get("design_module", {}) + study_type = design_module.get("studyType") + if study_type: + term.annotate_literal(has_category, study_type) + ENCOUNTERED_STUDY_TYPES.add(study_type) term.append_parent(CLINICAL_TRIAL_TERM) + return term + + +if __name__ == "__main__": + ClinicalTrialsGetter.cli() From 7c39a7838c590301e5a5980fab1bcdf90453fea2 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Jan 2025 12:18:54 +0100 Subject: [PATCH 3/7] Update --- pyproject.toml | 1 + src/pyobo/sources/clinicaltrials.py | 64 +++++++++++++++++++---------- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ac164ba2..ea4743d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ dependencies = [ "drugbank_downloader", "chembl_downloader", "umls_downloader>=0.1.3", + "clinicaltrials_downloader>=0.0.2", "typing_extensions", "rdflib", ] diff --git a/src/pyobo/sources/clinicaltrials.py b/src/pyobo/sources/clinicaltrials.py index d8f646ac..29274eca 100644 --- a/src/pyobo/sources/clinicaltrials.py +++ b/src/pyobo/sources/clinicaltrials.py @@ -2,11 +2,11 @@ from collections.abc import Iterable -from clinicaltrials_downloader import get_studies +from clinicaltrials_downloader import get_studies_slim from pyobo import Obo, Reference, Term, default_reference from pyobo.struct.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED -from pyobo.struct.typedef import has_category, has_contributor +from pyobo.struct.typedef import has_contributor __all__ = [ "ClinicalTrialsGetter", @@ -14,12 +14,37 @@ PREFIX = "clinicaltrials" -CLINICAL_TRIAL_TERM = ( - Term(reference=default_reference(PREFIX, "clinical-trial", name="clinical trial")) - .annotate_object(has_contributor, CHARLIE_TERM) - .append_comment(PYOBO_INJECTED) - .append_see_also_uri("https://github.com/obi-ontology/obi/issues/1831#issuecomment-2587810590") -) +STUDY = Term(reference=default_reference(PREFIX, "study", name="study")) + +CLINICAL_TRIAL_TERM = Term( + reference=default_reference(PREFIX, "clinical-trial", name="clinical trial") +).append_parent(STUDY) + +INTERVENTIONAL = Term( + reference=default_reference( + PREFIX, "interventional-clinical-trial", name="interventional clinical trial" + ) +).append_parent(CLINICAL_TRIAL_TERM) + +OBSERVATIONAL = Term( + reference=default_reference( + PREFIX, "observational-clinical-trial", name="observational clinical trial" + ) +).append_parent(CLINICAL_TRIAL_TERM) + +EXPANDED_ACCESS = Term( + reference=default_reference( + PREFIX, "expanded-access-study", name="expanded access study" + ) +).append_parent(STUDY) + +TERMS = [STUDY, CLINICAL_TRIAL_TERM, OBSERVATIONAL, INTERVENTIONAL, EXPANDED_ACCESS] +PARENTS: dict[str | None, Term] = { + "INTERVENTIONAL": INTERVENTIONAL, + "OBSERVATIONAL": OBSERVATIONAL, + "EXPANDED_ACCESS": EXPANDED_ACCESS, + None: STUDY, +} class ClinicalTrialsGetter(Obo): @@ -27,27 +52,27 @@ class ClinicalTrialsGetter(Obo): ontology = PREFIX dynamic_version = True - typedefs = [has_contributor, has_category] + typedefs = [has_contributor] + root_terms = [STUDY] def iter_terms(self, force: bool = False) -> Iterable[Term]: """Iterate over terms for studies.""" - yield CLINICAL_TRIAL_TERM yield CHARLIE_TERM yield HUMAN_TERM + for term in TERMS: + term.annotate_object(has_contributor, CHARLIE_TERM) + term.append_comment(PYOBO_INJECTED) + yield term yield from iterate_studies() def iterate_studies(*, force: bool = False) -> Iterable[Term]: """Iterate over terms for studies.""" - studies = get_studies(force=force) + studies = get_studies_slim(force=force) for study in studies: yield _process_study(study) - -ENCOUNTERED_STUDY_TYPES = set() - - def _process_study(raw_study) -> Term: protocol_section = raw_study["protocolSection"] identification_module = protocol_section["identificationModule"] @@ -64,14 +89,9 @@ def _process_study(raw_study) -> Term: if synonym: term.append_synonym(synonym) - # TODO make the study type into inheritance, when available - design_module = protocol_section.get("design_module", {}) + design_module = protocol_section.get("designModule", {}) study_type = design_module.get("studyType") - if study_type: - term.annotate_literal(has_category, study_type) - ENCOUNTERED_STUDY_TYPES.add(study_type) - term.append_parent(CLINICAL_TRIAL_TERM) - + term.append_parent(PARENTS[study_type]) return term From 14fa1640ca14fb13a9f89cd2f85416789678825c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Jan 2025 12:33:28 +0100 Subject: [PATCH 4/7] Refactor --- src/pyobo/sources/clinicaltrials.py | 40 +++++++++++++++++------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/pyobo/sources/clinicaltrials.py b/src/pyobo/sources/clinicaltrials.py index 29274eca..631122e3 100644 --- a/src/pyobo/sources/clinicaltrials.py +++ b/src/pyobo/sources/clinicaltrials.py @@ -14,36 +14,44 @@ PREFIX = "clinicaltrials" -STUDY = Term(reference=default_reference(PREFIX, "study", name="study")) +STUDY_TERM = Term(reference=default_reference(PREFIX, "study", name="study")) CLINICAL_TRIAL_TERM = Term( reference=default_reference(PREFIX, "clinical-trial", name="clinical trial") -).append_parent(STUDY) +).append_parent(STUDY_TERM) -INTERVENTIONAL = Term( +INTERVENTIONAL_CLINICAL_TRIAL_TERM = Term( reference=default_reference( PREFIX, "interventional-clinical-trial", name="interventional clinical trial" ) ).append_parent(CLINICAL_TRIAL_TERM) -OBSERVATIONAL = Term( +OBSERVATIONAL_CLINICAL_TRIAL_TERM = Term( reference=default_reference( PREFIX, "observational-clinical-trial", name="observational clinical trial" ) ).append_parent(CLINICAL_TRIAL_TERM) -EXPANDED_ACCESS = Term( - reference=default_reference( - PREFIX, "expanded-access-study", name="expanded access study" - ) -).append_parent(STUDY) +EXPANDED_ACCESS_STUDY_TERM = Term( + reference=default_reference(PREFIX, "expanded-access-study", name="expanded access study") +).append_parent(STUDY_TERM) + +TERMS = [ + STUDY_TERM, + CLINICAL_TRIAL_TERM, + OBSERVATIONAL_CLINICAL_TRIAL_TERM, + INTERVENTIONAL_CLINICAL_TRIAL_TERM, + EXPANDED_ACCESS_STUDY_TERM, +] -TERMS = [STUDY, CLINICAL_TRIAL_TERM, OBSERVATIONAL, INTERVENTIONAL, EXPANDED_ACCESS] +# These were identified as the 4 possibilities for study +# types in ClinicalTrials.gov. See summary script at +# https://gist.github.com/cthoyt/12a3cb3c63ad68d73fe5a2f0d506526f PARENTS: dict[str | None, Term] = { - "INTERVENTIONAL": INTERVENTIONAL, - "OBSERVATIONAL": OBSERVATIONAL, - "EXPANDED_ACCESS": EXPANDED_ACCESS, - None: STUDY, + "INTERVENTIONAL": INTERVENTIONAL_CLINICAL_TRIAL_TERM, + "OBSERVATIONAL": OBSERVATIONAL_CLINICAL_TRIAL_TERM, + "EXPANDED_ACCESS": EXPANDED_ACCESS_STUDY_TERM, + None: STUDY_TERM, } @@ -53,14 +61,14 @@ class ClinicalTrialsGetter(Obo): ontology = PREFIX dynamic_version = True typedefs = [has_contributor] - root_terms = [STUDY] + root_terms = [STUDY_TERM] def iter_terms(self, force: bool = False) -> Iterable[Term]: """Iterate over terms for studies.""" yield CHARLIE_TERM yield HUMAN_TERM for term in TERMS: - term.annotate_object(has_contributor, CHARLIE_TERM) + term.append_contributor(CHARLIE_TERM) term.append_comment(PYOBO_INJECTED) yield term yield from iterate_studies() From 2d1869b827da22cbaf49f74cea75e047f39ed5f3 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Jan 2025 12:34:10 +0100 Subject: [PATCH 5/7] Update clinicaltrials.py --- src/pyobo/sources/clinicaltrials.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyobo/sources/clinicaltrials.py b/src/pyobo/sources/clinicaltrials.py index 631122e3..4d1a026e 100644 --- a/src/pyobo/sources/clinicaltrials.py +++ b/src/pyobo/sources/clinicaltrials.py @@ -61,7 +61,7 @@ class ClinicalTrialsGetter(Obo): ontology = PREFIX dynamic_version = True typedefs = [has_contributor] - root_terms = [STUDY_TERM] + root_terms = [STUDY_TERM.reference] def iter_terms(self, force: bool = False) -> Iterable[Term]: """Iterate over terms for studies.""" From e49917156d7158f280581bb8d86e647bf149310a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Jan 2025 13:01:11 +0100 Subject: [PATCH 6/7] Update clinicaltrials.py --- src/pyobo/sources/clinicaltrials.py | 65 +++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/src/pyobo/sources/clinicaltrials.py b/src/pyobo/sources/clinicaltrials.py index 4d1a026e..91d029f0 100644 --- a/src/pyobo/sources/clinicaltrials.py +++ b/src/pyobo/sources/clinicaltrials.py @@ -4,7 +4,7 @@ from clinicaltrials_downloader import get_studies_slim -from pyobo import Obo, Reference, Term, default_reference +from pyobo import Obo, Reference, Term, TypeDef, default_reference from pyobo.struct.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED from pyobo.struct.typedef import has_contributor @@ -14,6 +14,17 @@ PREFIX = "clinicaltrials" +INVESTIGATES_CONDITION = TypeDef( + reference=Reference( + prefix=PREFIX, identifier="investigates_condition", name="investigates condition" + ), + is_metadata_tag=True, +) +HAS_INTERVENTION = TypeDef( + reference=Reference(prefix=PREFIX, identifier="has_intervention", name="has intervention"), + is_metadata_tag=True, +) + STUDY_TERM = Term(reference=default_reference(PREFIX, "study", name="study")) CLINICAL_TRIAL_TERM = Term( @@ -26,6 +37,22 @@ ) ).append_parent(CLINICAL_TRIAL_TERM) +RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM = Term( + reference=default_reference( + PREFIX, + "randomized-interventional-clinical-trial", + name="randomized interventional clinical trial", + ) +).append_parent(INTERVENTIONAL_CLINICAL_TRIAL_TERM) + +NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM = Term( + reference=default_reference( + PREFIX, + "non-randomized-interventional-clinical-trial", + name="non-randomized interventional clinical trial", + ) +).append_parent(INTERVENTIONAL_CLINICAL_TRIAL_TERM) + OBSERVATIONAL_CLINICAL_TRIAL_TERM = Term( reference=default_reference( PREFIX, "observational-clinical-trial", name="observational clinical trial" @@ -47,11 +74,14 @@ # These were identified as the 4 possibilities for study # types in ClinicalTrials.gov. See summary script at # https://gist.github.com/cthoyt/12a3cb3c63ad68d73fe5a2f0d506526f -PARENTS: dict[str | None, Term] = { - "INTERVENTIONAL": INTERVENTIONAL_CLINICAL_TRIAL_TERM, - "OBSERVATIONAL": OBSERVATIONAL_CLINICAL_TRIAL_TERM, - "EXPANDED_ACCESS": EXPANDED_ACCESS_STUDY_TERM, - None: STUDY_TERM, +PARENTS: dict[tuple[str | None, str | None], Term] = { + ("INTERVENTIONAL", None): INTERVENTIONAL_CLINICAL_TRIAL_TERM, + ("INTERVENTIONAL", "NA"): INTERVENTIONAL_CLINICAL_TRIAL_TERM, + ("INTERVENTIONAL", "RANDOMIZED"): RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM, + ("INTERVENTIONAL", "NON_RANDOMIZED"): NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM, + ("OBSERVATIONAL", None): OBSERVATIONAL_CLINICAL_TRIAL_TERM, + ("EXPANDED_ACCESS", None): EXPANDED_ACCESS_STUDY_TERM, + (None, None): STUDY_TERM, } @@ -60,7 +90,7 @@ class ClinicalTrialsGetter(Obo): ontology = PREFIX dynamic_version = True - typedefs = [has_contributor] + typedefs = [has_contributor, INVESTIGATES_CONDITION, HAS_INTERVENTION] root_terms = [STUDY_TERM.reference] def iter_terms(self, force: bool = False) -> Iterable[Term]: @@ -99,9 +129,28 @@ def _process_study(raw_study) -> Term: design_module = protocol_section.get("designModule", {}) study_type = design_module.get("studyType") - term.append_parent(PARENTS[study_type]) + allocation = design_module.get("designInfo", {}).get("allocation") + term.append_parent(PARENTS[study_type, allocation]) + + references_module = protocol_section.get("referencesModule", {}) + for reference in references_module.get("references", []): + if pubmed_id := reference.get("pmid"): + term.append_see_also(Reference(prefix="pubmed", identifier=pubmed_id)) + + derived_section = raw_study["derivedSection"] + for mesh_record in derived_section.get("conditionBrowseModule", {}).get("meshes", []): + term.annotate_object(INVESTIGATES_CONDITION, _mesh(mesh_record)) + + for mesh_record in derived_section.get("interventionBrowseModule", {}).get("meshes", []): + term.annotate_object(HAS_INTERVENTION, _mesh(mesh_record)) return term +def _mesh(mesh_record: dict[str, str]) -> Reference: + return Reference( + prefix="mesh", identifier=mesh_record["id"], name=mesh_record.get("term") or None + ) + + if __name__ == "__main__": ClinicalTrialsGetter.cli() From 8ea0f76293176c5443bb2e20fccd164afa2a2262 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Jan 2025 13:13:59 +0100 Subject: [PATCH 7/7] Update clinicaltrials.py --- src/pyobo/sources/clinicaltrials.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pyobo/sources/clinicaltrials.py b/src/pyobo/sources/clinicaltrials.py index 91d029f0..78b71815 100644 --- a/src/pyobo/sources/clinicaltrials.py +++ b/src/pyobo/sources/clinicaltrials.py @@ -69,6 +69,8 @@ OBSERVATIONAL_CLINICAL_TRIAL_TERM, INTERVENTIONAL_CLINICAL_TRIAL_TERM, EXPANDED_ACCESS_STUDY_TERM, + RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM, + NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM, ] # These were identified as the 4 possibilities for study