From dc9ad7d2aa11d42b82bca94a2be282370d4c9fcc Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt <cthoyt@gmail.com> Date: Wed, 3 Jan 2024 16:27:33 +0100 Subject: [PATCH] Update EC, ComplexPortal, NPASS, and UniProt source --- src/pyobo/identifier_utils.py | 2 +- src/pyobo/sources/complexportal.py | 4 ++++ src/pyobo/sources/npass.py | 4 +++- src/pyobo/sources/uniprot/uniprot.py | 10 +++++----- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/pyobo/identifier_utils.py b/src/pyobo/identifier_utils.py index efd8caca..aabec170 100644 --- a/src/pyobo/identifier_utils.py +++ b/src/pyobo/identifier_utils.py @@ -119,7 +119,7 @@ def _wrapped(prefix, *args, **kwargs): def standardize_ec(ec: str) -> str: """Standardize an EC code identifier by removing all trailing dashes and dots.""" - ec = ec.strip() + ec = ec.strip().replace(" ", "") for _ in range(4): ec = ec.rstrip("-").rstrip(".") return ec diff --git a/src/pyobo/sources/complexportal.py b/src/pyobo/sources/complexportal.py index 4afe93da..0a6d71bb 100644 --- a/src/pyobo/sources/complexportal.py +++ b/src/pyobo/sources/complexportal.py @@ -82,6 +82,10 @@ def _parse_xrefs(s) -> List[Tuple[Reference, str]]: logger.warning("xref missing (: %s", xref) continue note = note.rstrip(")") + if note.lower().startswith("rhea "): + note = note[len("Rhea ") :] + if note.lower().startswith("ec:"): + note = note[len("EC:") :] try: reference = Reference.from_curie(xref_curie) except ValueError: diff --git a/src/pyobo/sources/npass.py b/src/pyobo/sources/npass.py index e1a0c1c6..a4949a34 100644 --- a/src/pyobo/sources/npass.py +++ b/src/pyobo/sources/npass.py @@ -72,7 +72,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]: # TODO check that the first is always the parent compound? 
if pd.notna(pubchem_compound_ids): - pubchem_compound_ids = pubchem_compound_ids.split(";") + pubchem_compound_ids = [ + yy.strip() for xx in pubchem_compound_ids.split(";") for yy in xx.strip().split(",") + ] if len(pubchem_compound_ids) > 1: logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids) for pubchem_compound_id in pubchem_compound_ids: diff --git a/src/pyobo/sources/uniprot/uniprot.py b/src/pyobo/sources/uniprot/uniprot.py index 3f5d4355..67ceb0e3 100644 --- a/src/pyobo/sources/uniprot/uniprot.py +++ b/src/pyobo/sources/uniprot/uniprot.py @@ -30,7 +30,7 @@ class UniProtGetter(Obo): def iter_terms(self, force: bool = False) -> Iterable[Term]: """Iterate over terms in the ontology.""" - yield from iter_terms(force=force, version=self._version_or_raise) + yield from iter_terms(version=self._version_or_raise) def get_obo(force: bool = False) -> Obo: @@ -38,9 +38,9 @@ def get_obo(force: bool = False) -> Obo: return UniProtGetter(force=force) -def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]: +def iter_terms(version: Optional[str] = None) -> Iterable[Term]: """Iterate over UniProt Terms.""" - with open_reader(ensure(version=version, force=force)) as reader: + with open_reader(ensure(version=version)) as reader: _ = next(reader) # header for uniprot_id, name, taxonomy_id, _synonyms, ecs, pubmeds, pdbs in tqdm( reader, desc="Mapping UniProt", unit_scale=True @@ -63,11 +63,11 @@ def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[T yield term -def ensure(version: Optional[str] = None, force: bool = False) -> Path: +def ensure(version: Optional[str] = None) -> Path: """Ensure the reviewed uniprot names are available.""" if version is None: version = bioversions.get_version("uniprot") - return RAW_MODULE.ensure(PREFIX, version, name="reviewed.tsv.gz", url=REVIEWED_URL, force=force) + return RAW_MODULE.ensure(PREFIX, version, name="reviewed.tsv.gz", url=REVIEWED_URL) if __name__ == 
"__main__":