Merge pull request #1423 from sorgerlab/db-sources-updates

Db sources updates
sorgerlab · Nov 22, 2023 · 4c33d60 · 4c33d60
2 parents 68f7d15 + 5360dea
commit 4c33d60
Show file tree

Hide file tree

Showing 12 changed files with 82 additions and 53 deletions.
diff --git a/indra/sources/bel/api.py b/indra/sources/bel/api.py
@@ -17,7 +17,7 @@
 
 logger = logging.getLogger(__name__)
 
-version = 'v1.0.0'
+version = 'v1.1.2'
 branch = 'https://github.com/cthoyt/selventa-knowledge/raw/' \
          '{}/selventa_knowledge/{}'
 large_corpus_url = branch.format(version, 'large_corpus.bel.nodelink.json.gz')

diff --git a/indra/sources/bel/processor.py b/indra/sources/bel/processor.py
@@ -547,9 +547,14 @@ def get_db_refs_by_name(ns, name, node_data):
             db_refs = {'UP': up_id}
     # Map Selventa families and complexes to FamPlex
     elif ns == 'SFAM':
-        sfam_id, xrefs = selventa_lookup[('SFAM', name)]
-        db_refs = {'SFAM': sfam_id}
-        indra_name = bel_to_indra.get(name)
+        try:
+            sfam_id, xrefs = selventa_lookup[('SFAM', name)]
+            db_refs = {"SFAM": sfam_id}
+            indra_name = bel_to_indra.get(name)
+        except KeyError:
+            indra_name = None
+            db_refs = None
+
         if indra_name is None:
             logger.info('Could not find mapping for BEL/SFAM family: '
                         '%s (%s)' % (name, node_data))
@@ -614,9 +619,15 @@ def get_db_refs_by_name(ns, name, node_data):
         name = chebi_client.get_chebi_name_from_id(chebi_id)
     # SDIS, SCHEM: Look up the ID and include it in the db_refs
     elif ns in {'SDIS', 'SCHEM'}:
-        sid, xrefs = selventa_lookup[(ns, name)]
-        db_refs = xrefs.copy()
-        db_refs[ns] = sid
+        try:
+            sid, xrefs = selventa_lookup[(ns, name)]
+            db_refs = xrefs.copy()
+            db_refs[ns] = sid
+        except KeyError:
+            logger.info(
+                f"Could not map Selventa name {name} to ID for {ns}."
+            )
+            return name, None
     elif ns == 'TEXT':
         db_refs = {ns: name}
     elif ns == 'TAX':

diff --git a/indra/sources/ctd/api.py b/indra/sources/ctd/api.py
@@ -2,7 +2,7 @@
 from .processor import CTDProcessor, CTDChemicalDiseaseProcessor, \
     CTDGeneDiseaseProcessor, CTDChemicalGeneProcessor
 
-base_url = 'http://ctdbase.org/reports/'
+base_url = 'https://ctdbase.org/reports/'
 
 urls = {
     'chemical_gene': base_url + 'CTD_chem_gene_ixns.tsv.gz',

diff --git a/indra/sources/phosphoelm/api.py b/indra/sources/phosphoelm/api.py
@@ -1,6 +1,8 @@
 import csv
 import logging
 
+from tqdm import tqdm
+
 from .processor import PhosphoElmProcessor
 
 logger = logging.getLogger(__name__)
@@ -36,7 +38,8 @@ def _get_json_from_entry_rows(row_iter):
     """Loop body to generate a json friendly structure"""
     ppelm_json = []
     columns = next(row_iter)
-    for entry in row_iter:
+    logger.info('Processing Phospho.ELM dump')
+    for entry in tqdm(row_iter):
         row_dict = {c: e for c, e in zip(columns, entry)}
         ppelm_json.append(row_dict)
     return ppelm_json
diff --git a/indra/sources/phosphoelm/processor.py b/indra/sources/phosphoelm/processor.py
@@ -1,13 +1,13 @@
 import logging
-import requests
+from tqdm import tqdm
 
+import gilda
 from indra.databases import uniprot_client, hgnc_client
 from indra.statements.validate import validate_text_refs
 from indra.statements import Phosphorylation, Evidence, Agent
 
 from .phosphoelm_mapping import phosphoelm_mapping
 
-gilda_url = 'http://grounding.indra.bio/ground'
 logger = logging.getLogger(__name__)
 
 
@@ -40,7 +40,8 @@ def process_phosphorylations(self, skip_empty=True):
             Default: True. If False, also create statements when upstream
             kinases in entry['kinases'] are not known.
         """
-        for entry in self._phosphoelm_data:
+        logger.info("Processing Phospho.ELM phosphorylations")
+        for entry in tqdm(self._phosphoelm_data):
             if entry['species'].lower() != 'homo sapiens' or\
                     skip_empty and not entry['kinases']:
                 # Skip entries without any kinases or if species is other
@@ -156,17 +157,12 @@ def _agent_from_str(txt):
 
 def _gilda_grounder(txt):
     # Pre-process text for grounding
-    txt = txt.replace('_group', '')
-    txt = txt.replace('_', '-')
-    txt = txt.split('/')[0]
-    res = requests.post(gilda_url, json={'text': txt})
-    if res.status_code != 200:
-        logger.warning('Gilda service responded with status code %d' %
-                       res.status_code)
+    txt = txt.replace("_group", "")
+    txt = txt.replace("_", "-")
+    txt = txt.split("/")[0]
+    res = gilda.ground(txt)
+    if not res:
+        logger.warning(f"Gilda grounder returned no results for {txt}")
         return None
-    rj = res.json()
-    if not rj:
-        return None
-    top_term = rj[0]['term']
-    return top_term
-
+    top_term = res[0].term
+    return top_term.to_json()
diff --git a/indra/sources/rlimsp/processor.py b/indra/sources/rlimsp/processor.py
@@ -251,7 +251,10 @@ def get_agent_from_entity_info(entity_info):
             refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
         # These we take as is
         elif id_dict['source'] in ('MESH', 'OMIM'):
-            refs[id_dict['source']] = id_dict['idString']
+            if ';' in id_dict['idString']:
+                refs[id_dict['source']] = id_dict['idString'].split(';')[0]
+            else:
+                refs[id_dict['source']] = id_dict['idString']
         # CTD is sometimes used for MESH chemical IDs but can also be just '-'
         elif id_dict['source'] == 'CTD':
             if id_dict['idString'] != '-':

diff --git a/indra/sources/signor/processor.py b/indra/sources/signor/processor.py
@@ -246,7 +246,11 @@ def _get_agent(self, ent_name, ent_type, id, database):
                     # SIGNOR's format in which it leaves extra spaces around
                     # the ID, as in 'CID: 923'
                     id = id[4:].strip()
-                elif database == 'ChEBI' and id.startswith('SID:'):
+                # In older releases, PubChem substance IDs were used with
+                # ChEBI as the source; these were later changed to use
+                # PUBCHEM.
+                elif database in {'ChEBI', 'PUBCHEM'} \
+                        and id.startswith('SID:'):
                     gnd_type = 'PUBCHEM.SUBSTANCE'
                     id = id[4:].strip()
                 db_refs = {gnd_type: id}

diff --git a/indra/sources/ubibrowser/api.py b/indra/sources/ubibrowser/api.py
@@ -4,7 +4,7 @@
 from .processor import UbiBrowserProcessor
 
 
-DOWNLOAD_URL = 'http://ubibrowser.ncpsb.org.cn/v2/Public/download/literature/'
+DOWNLOAD_URL = 'http://ubibrowser.bio-it.cn/ubibrowser_v3/Public/download/literature/'
 E3_URL = DOWNLOAD_URL + 'literature.E3.txt'
 DUB_URL = DOWNLOAD_URL + 'literature.DUB.txt'
 

diff --git a/indra/sources/ubibrowser/processor.py b/indra/sources/ubibrowser/processor.py
@@ -10,23 +10,33 @@ def __init__(self, e3_df, dub_df):
         self.statements = []
 
     def extract_statements(self):
-        for df, stmt_type in [(self.e3_df, Ubiquitination),
-                              (self.dub_df, Deubiquitination)]:
+        for df, stmt_type, subj_suffix in \
+                [(self.e3_df, Ubiquitination, 'E3'),
+                 (self.dub_df, Deubiquitination, 'DUB')]:
             for _, row in df.iterrows():
-                stmt = self._process_row(row, stmt_type)
+                stmt = self._process_row(row, stmt_type, subj_suffix)
                 if stmt:
                     self.statements.append(stmt)
 
     @staticmethod
-    def _process_row(row, stmt_type):
+    def _process_row(row, stmt_type, subj_suffix):
         # Note that even in the DUB table the subject of the statement
         # is called "E3"
         # There are some examples where a complex is implied (e.g., BMI1-RNF2),
         # for simplicity we just ignore these
-        if '-' in row['E3AC']:
+        if '#' in row[f'SwissProt AC ({subj_suffix})']:
             return None
-        subj_agent = get_standard_agent(row['E3GENE'], {'UP': row['E3AC']})
-        obj_agent = get_standard_agent(row['SUBGENE'], {'UP': row['SUBAC']})
+        # Some E3/DUB accessions are missing entirely ('-'); we skip these
+        elif row[f'SwissProt AC ({subj_suffix})'] == '-':
+            return None
+        # Some of the same corner cases apply to the substrate as well
+        if row['SwissProt AC (Substrate)'] == '-':
+            return None
+        subj_agent = \
+            get_standard_agent(row[f'Gene Symbol ({subj_suffix})'],
+                               {'UP': row[f'SwissProt AC ({subj_suffix})']})
+        obj_agent = get_standard_agent(row['Gene Symbol (Substrate)'],
+                                       {'UP': row['SwissProt AC (Substrate)']})
         if row['SOURCE'] == 'MEDLINE' and row['SOURCEID'] != 'UNIPROT':
             # Note: we sometimes get int here
             pmid = str(row['SOURCEID'])

diff --git a/indra/tests/test_sources/resources/ubibrowser_dub.txt b/indra/tests/test_sources/resources/ubibrowser_dub.txt
@@ -1,5 +1,5 @@
-NUMBER	E3ID	SUBID	E3AC	SUBAC	E3GENE	SUBGENE	SOURCE	SOURCEID	SENTENCE	E3TYPE	COUNT	type	species
-671	UBP10_HUMAN	PTEN_HUMAN	Q14694	P60484	USP10	PTEN	MEDLINE	28852924	We further demonstrated that USP10 directly interacted with and stabilized PTEN via deubiquitination.	USP	1	DUB	H.sapiens
-673	UBP36_HUMAN	SODM_HUMAN	Q9P275	P04179	USP36	SOD2	MEDLINE	21268071	we identified a deubiquitinating enzyme USP36 that regulates the protein stability of SOD2	USP	1	DUB	H.sapiens
-675	UBP13_HUMAN	UBL4A_HUMAN	Q92995	P11441	USP13	UBL4A	MEDLINE	24424410	we identify USP13 as a gp78-associated DUB that eliminates ubiquitin conjugates from Ubl4A to maintain the functionality of  Bag6.	USP	2	DUB	H.sapiens
-677	UBP33_HUMAN	ARRB1_HUMAN	Q8TEY7	P49407	USP33	ARRB1	MEDLINE	19363159	We now report the discovery that the deubiquitinating enzyme ubiquitin-specific protease 33 (USP33) binds beta-arrestin2 and leads to the deubiquitination of beta-arrestins.	USP	1	DUB	H.sapiens
+NUMBER	SwissProt ID (DUB)	SwissProt ID (Substrate)	SwissProt AC (DUB)	SwissProt AC (Substrate)	Gene Symbol (DUB)	Gene Symbol (Substrate)	SOURCE	SOURCEID	SENTENCE	DUBTYPE	COUNT	type	species
+56	UBP33_HUMAN	ARRB1_HUMAN	Q8TEY7	P49407	USP33	ARRB1	MEDLINE	19363159	"We now report the discovery that the deubiquitinating enzyme ubiquitin-specific protease 33 (<span class=""match term0"">USP33</span>) binds beta-arrestin2 and leads to the deubiquitination of beta-arrestins"	USP	1	Training data	H.sapiens
+129	UBP13_HUMAN	UBL4A_HUMAN	Q92995	P11441	USP13	UBL4A	MEDLINE	24424410	"we identify <span class=""match term0"">USP13</span> as a gp78-associated DUB that eliminates ubiquitin conjugates from <span class=""match term1"">Ubl4A</span> to maintain the functionality of Bag6"	USP	1	Other	H.sapiens
+388	UBP10_HUMAN	PTEN_HUMAN	Q14694	P60484	USP10	PTEN	MEDLINE	28852924	"We further demonstrated that <span class=""match term0"">USP10</span> directly interacted with and stabilized <span class=""match term1"">PTEN</span> via deubiquitination"	USP	1	Training data	H.sapiens
+726	UBP36_HUMAN	SODM_HUMAN	Q9P275	P04179	USP36	SOD2	MEDLINE	21268071	"we identified a deubiquitinating enzyme <span class=""match term0"">USP36</span> that regulates the protein stability of <span class=""match term1"">SOD2</span>"	USP	1	Other	H.sapiens
diff --git a/indra/tests/test_sources/resources/ubibrowser_e3.txt b/indra/tests/test_sources/resources/ubibrowser_e3.txt
@@ -1,6 +1,6 @@
-NUMBER	E3ID	SUBID	E3AC	SUBAC	E3GENE	SUBGENE	SOURCE	SOURCEID	SENTENCE	E3TYPE	COUNT	type	species
-1	AMFR2_HUMAN	A1AT_HUMAN	Q9UKV5	P01009	AMFR	SERPINA1	MEDLINE	16979136	Here we report that gp78, a ubiquitin ligase (E3) pairing with mammalian Ubc7 for ERAD, ubiquitinates and facilitates degradation of ATZ, the classic deficiency variant of AAT having a Z mutation (Glu 342 Lys)	RING	3	E3	H.sapiens
-15	AMFR2_HUMAN	HMDH_HUMAN	Q9UKV5	P04035	AMFR	HMGCR	MEDLINE	20458442	UBE2G2, a previously known E2 of gp78, is demonstrated to be involved in the sterol-regulated ubiquitination and degradation of HMGCR	RING	4	E3	H.sapiens
-300	BRCA1_HUMAN	BRCA1_HUMAN	P38398	P38398	BRCA1	BRCA1	UniProt	RNF8_BOVIN	Following DNA double-strand breaks (DSBs), it is recruited to the sites of damage by ATM-phosphorylated MDC1, mediates the ubiquitination of histones H2A and H2AX, thereby promoting the formation of TP53BP1 and BRCA1 ionizing radiation-induced foci (IRIF)	RING	21	E3	H.sapiens
-5642	HRD1_CAEEL	Q9BMU4_CAEEL	Q20798	Q9BMU4	sel-11	atln-1	MEDLINE	32916628	UbiNet 2.0	RING	1	E3	C.elegans
-5644	A0A2I4KBP1_DANRE	SHH_DANRE	A0A2I4KBP1	Q92008	gan	shha	MEDLINE	31503551	UbiNet 2.0	other	1	E3	D.rerio
+NUMBER	SwissProt ID (E3)	SwissProt ID (Substrate)	SwissProt AC (E3)	SwissProt AC (Substrate)	Gene Symbol (E3)	Gene Symbol (Substrate)	SOURCE	SOURCEID	SENTENCE	E3TYPE	COUNT	type	species
+109	HRD1_CAEEL	Q9BMU4_CAEEL	Q20798	Q9BMU4	sel-11	atln-1	MEDLINE	32916628	32916628	RING	1	Other	C.elegans
+167	A0A2I4KBP1_DANRE	SHH_DANRE	A0A2I4KBP1	Q92008	gan	shha	MEDLINE	31503551	31503551	Other	1	Other	D.rerio
+198	AMFR2_HUMAN	A1AT_HUMAN	Q9UKV5	P01009	AMFR	SERPINA1	MEDLINE	16979136	"Here we report that <span class=""match term0"">gp78</span>, a ubiquitin ligase (E3) pairing with mammalian Ubc7 for ERAD, ubiquitinates and facilitates degradation of ATZ, the classic deficiency variant of <span class=""match term1"">AAT</span> having a Z mutation (Glu 342 Lys)"	RING	1	Training data	H.sapiens
+1040	GAN_HUMAN	SHH_HUMAN	Q9H2C0	Q15465	GAN	SHH	MEDLINE	31503551	31503551	BTB_3	1	Other	H.sapiens
+2631	SYVN1_HUMAN	ATLA1_HUMAN	Q86TM6	Q8WXF7	SYVN1	ATL1	MEDLINE	32916628	"The E3 Ubiquitin Ligase <span class=""match term0"">SYVN1</span> Ubiquitinates Atlastins to Remodel the Endoplasmic Reticulum Network."	RING	1	Other	H.sapiens
diff --git a/indra/tests/test_sources/test_ubibrowser.py b/indra/tests/test_sources/test_ubibrowser.py
@@ -13,14 +13,15 @@
 def test_extract_statements():
     up = ubibrowser.process_file(e3_file, dub_file)
     assert len(up.statements) == 9
-    assert isinstance(up.statements[0], Ubiquitination)
+    assert isinstance(up.statements[2], Ubiquitination)
     assert isinstance(up.statements[-1], Deubiquitination)
 
     assert_valid_statements(up.statements)
 
-    #1	AMFR2_HUMAN	A1AT_HUMAN	Q9UKV5	P01009	AMFR	SERPINA1
-    # MEDLINE	16979136	Here we report that ...	RING	3	E3	H.sapiens
-    e3_stmt = up.statements[0]
+    #198	AMFR2_HUMAN	A1AT_HUMAN	Q9UKV5	P01009	AMFR	SERPINA1
+    # MEDLINE	16979136	"Here we report that ..."	RING	1
+    # "Training data"	H.sapiens
+    e3_stmt = up.statements[2]
     assert e3_stmt.enz.name == 'AMFR'
     assert e3_stmt.enz.db_refs['UP'] == 'Q9UKV5'
     assert e3_stmt.sub.name == 'SERPINA1'
@@ -30,9 +31,10 @@ def test_extract_statements():
     assert e3_stmt.evidence[0].pmid == '16979136'
     assert e3_stmt.evidence[0].text.startswith('Here we report that')
 
-    # 677	UBP33_HUMAN	ARRB1_HUMAN	Q8TEY7	P49407	USP33	ARRB1
-    # MEDLINE	19363159	We now report the discovery that...	USP	1	DUB	H.sapiens
-    dub_stmt = up.statements[-1]
+    # 56	UBP33_HUMAN	ARRB1_HUMAN	Q8TEY7	P49407	USP33	ARRB1	MEDLINE
+    # 19363159	"We now report the discovery that ..."	USP	1	"Training data"
+    # H.sapiens
+    dub_stmt = up.statements[5]
     assert dub_stmt.enz.name == 'USP33'
     assert dub_stmt.enz.db_refs['UP'] == 'Q8TEY7'
     assert dub_stmt.sub.name == 'ARRB1'