From dc9ad7d2aa11d42b82bca94a2be282370d4c9fcc Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Wed, 3 Jan 2024 16:27:33 +0100
Subject: [PATCH] Update EC, ComplexPortal, NPASS, and UniProt source

---
 src/pyobo/identifier_utils.py        |  2 +-
 src/pyobo/sources/complexportal.py   |  4 ++++
 src/pyobo/sources/npass.py           |  4 +++-
 src/pyobo/sources/uniprot/uniprot.py | 10 +++++-----
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/pyobo/identifier_utils.py b/src/pyobo/identifier_utils.py
index efd8caca..aabec170 100644
--- a/src/pyobo/identifier_utils.py
+++ b/src/pyobo/identifier_utils.py
@@ -119,7 +119,7 @@ def _wrapped(prefix, *args, **kwargs):
 
 def standardize_ec(ec: str) -> str:
     """Standardize an EC code identifier by removing all trailing dashes and dots."""
-    ec = ec.strip()
+    ec = ec.strip().replace(" ", "")  # trim the ends, then drop any spaces inside the code
     for _ in range(4):
         ec = ec.rstrip("-").rstrip(".")
     return ec
diff --git a/src/pyobo/sources/complexportal.py b/src/pyobo/sources/complexportal.py
index 4afe93da..0a6d71bb 100644
--- a/src/pyobo/sources/complexportal.py
+++ b/src/pyobo/sources/complexportal.py
@@ -82,6 +82,10 @@ def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
             logger.warning("xref missing (: %s", xref)
             continue
         note = note.rstrip(")")
+        if note.lower().startswith("rhea "):  # case-insensitive "Rhea " prefix
+            note = note[len("Rhea ") :]
+        if note.lower().startswith("ec:"):  # prefix must be lowercase to match note.lower()
+            note = note[len("EC:") :]
         try:
             reference = Reference.from_curie(xref_curie)
         except ValueError:
diff --git a/src/pyobo/sources/npass.py b/src/pyobo/sources/npass.py
index e1a0c1c6..a4949a34 100644
--- a/src/pyobo/sources/npass.py
+++ b/src/pyobo/sources/npass.py
@@ -72,7 +72,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
 
         # TODO check that the first is always the parent compound?
         if pd.notna(pubchem_compound_ids):
-            pubchem_compound_ids = pubchem_compound_ids.split(";")
+            pubchem_compound_ids = [
+                yy.strip() for xx in pubchem_compound_ids.split(";") for yy in xx.strip().split(",")
+            ]
             if len(pubchem_compound_ids) > 1:
                 logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids)
             for pubchem_compound_id in pubchem_compound_ids:
diff --git a/src/pyobo/sources/uniprot/uniprot.py b/src/pyobo/sources/uniprot/uniprot.py
index 3f5d4355..67ceb0e3 100644
--- a/src/pyobo/sources/uniprot/uniprot.py
+++ b/src/pyobo/sources/uniprot/uniprot.py
@@ -30,7 +30,7 @@ class UniProtGetter(Obo):
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
-        yield from iter_terms(force=force, version=self._version_or_raise)
+        yield from iter_terms(version=self._version_or_raise)
 
 
 def get_obo(force: bool = False) -> Obo:
@@ -38,9 +38,9 @@ def get_obo(force: bool = False) -> Obo:
     return UniProtGetter(force=force)
 
 
-def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
+def iter_terms(version: Optional[str] = None) -> Iterable[Term]:
     """Iterate over UniProt Terms."""
-    with open_reader(ensure(version=version, force=force)) as reader:
+    with open_reader(ensure(version=version)) as reader:
         _ = next(reader)  # header
         for uniprot_id, name, taxonomy_id, _synonyms, ecs, pubmeds, pdbs in tqdm(
             reader, desc="Mapping UniProt", unit_scale=True
@@ -63,11 +63,11 @@ def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[T
             yield term
 
 
-def ensure(version: Optional[str] = None, force: bool = False) -> Path:
+def ensure(version: Optional[str] = None) -> Path:
     """Ensure the reviewed uniprot names are available."""
     if version is None:
         version = bioversions.get_version("uniprot")
-    return RAW_MODULE.ensure(PREFIX, version, name="reviewed.tsv.gz", url=REVIEWED_URL, force=force)
+    return RAW_MODULE.ensure(PREFIX, version, name="reviewed.tsv.gz", url=REVIEWED_URL)
 
 
 if __name__ == "__main__":