diff --git a/indra/sources/bel/api.py b/indra/sources/bel/api.py
index fb1621c54d..a0de304502 100644
--- a/indra/sources/bel/api.py
+++ b/indra/sources/bel/api.py
@@ -17,7 +17,7 @@
logger = logging.getLogger(__name__)
-version = 'v1.0.0'
+version = 'v1.1.2'
branch = 'https://github.com/cthoyt/selventa-knowledge/raw/' \
'{}/selventa_knowledge/{}'
large_corpus_url = branch.format(version, 'large_corpus.bel.nodelink.json.gz')
diff --git a/indra/sources/bel/processor.py b/indra/sources/bel/processor.py
index 5a485d6984..49eed505eb 100644
--- a/indra/sources/bel/processor.py
+++ b/indra/sources/bel/processor.py
@@ -547,9 +547,14 @@ def get_db_refs_by_name(ns, name, node_data):
db_refs = {'UP': up_id}
# Map Selventa families and complexes to FamPlex
elif ns == 'SFAM':
- sfam_id, xrefs = selventa_lookup[('SFAM', name)]
- db_refs = {'SFAM': sfam_id}
- indra_name = bel_to_indra.get(name)
+ try:
+ sfam_id, xrefs = selventa_lookup[('SFAM', name)]
+ db_refs = {"SFAM": sfam_id}
+ indra_name = bel_to_indra.get(name)
+ except KeyError:
+ indra_name = None
+ db_refs = None
+
if indra_name is None:
logger.info('Could not find mapping for BEL/SFAM family: '
'%s (%s)' % (name, node_data))
@@ -614,9 +619,15 @@ def get_db_refs_by_name(ns, name, node_data):
name = chebi_client.get_chebi_name_from_id(chebi_id)
# SDIS, SCHEM: Look up the ID and include it in the db_refs
elif ns in {'SDIS', 'SCHEM'}:
- sid, xrefs = selventa_lookup[(ns, name)]
- db_refs = xrefs.copy()
- db_refs[ns] = sid
+ try:
+ sid, xrefs = selventa_lookup[(ns, name)]
+ db_refs = xrefs.copy()
+ db_refs[ns] = sid
+ except KeyError:
+ logger.info(
+ f"Could not map Selventa name {name} to ID for {ns}."
+ )
+ return name, None
elif ns == 'TEXT':
db_refs = {ns: name}
elif ns == 'TAX':
diff --git a/indra/sources/ctd/api.py b/indra/sources/ctd/api.py
index 2ee8055851..a46d07f62c 100644
--- a/indra/sources/ctd/api.py
+++ b/indra/sources/ctd/api.py
@@ -2,7 +2,7 @@
from .processor import CTDProcessor, CTDChemicalDiseaseProcessor, \
CTDGeneDiseaseProcessor, CTDChemicalGeneProcessor
-base_url = 'http://ctdbase.org/reports/'
+base_url = 'https://ctdbase.org/reports/'
urls = {
'chemical_gene': base_url + 'CTD_chem_gene_ixns.tsv.gz',
diff --git a/indra/sources/phosphoelm/api.py b/indra/sources/phosphoelm/api.py
index b8cc7bd03c..6b542f19a4 100644
--- a/indra/sources/phosphoelm/api.py
+++ b/indra/sources/phosphoelm/api.py
@@ -1,6 +1,8 @@
import csv
import logging
+from tqdm import tqdm
+
from .processor import PhosphoElmProcessor
logger = logging.getLogger(__name__)
@@ -36,7 +38,8 @@ def _get_json_from_entry_rows(row_iter):
"""Loop body to generate a json friendly structure"""
ppelm_json = []
columns = next(row_iter)
- for entry in row_iter:
+ logger.info('Processing Phospho.ELM dump')
+ for entry in tqdm(row_iter):
row_dict = {c: e for c, e in zip(columns, entry)}
ppelm_json.append(row_dict)
return ppelm_json
diff --git a/indra/sources/phosphoelm/processor.py b/indra/sources/phosphoelm/processor.py
index 9fb7e0b572..4b8b9aad91 100644
--- a/indra/sources/phosphoelm/processor.py
+++ b/indra/sources/phosphoelm/processor.py
@@ -1,13 +1,13 @@
import logging
-import requests
+from tqdm import tqdm
+import gilda
from indra.databases import uniprot_client, hgnc_client
from indra.statements.validate import validate_text_refs
from indra.statements import Phosphorylation, Evidence, Agent
from .phosphoelm_mapping import phosphoelm_mapping
-gilda_url = 'http://grounding.indra.bio/ground'
logger = logging.getLogger(__name__)
@@ -40,7 +40,8 @@ def process_phosphorylations(self, skip_empty=True):
Default: True. If False, also create statements when upstream
kinases in entry['kinases'] are not known.
"""
- for entry in self._phosphoelm_data:
+ logger.info("Processing Phospho.ELM phosphorylations")
+ for entry in tqdm(self._phosphoelm_data):
if entry['species'].lower() != 'homo sapiens' or\
skip_empty and not entry['kinases']:
# Skip entries without any kinases or if species is other
@@ -156,17 +157,12 @@ def _agent_from_str(txt):
def _gilda_grounder(txt):
# Pre-process text for grounding
- txt = txt.replace('_group', '')
- txt = txt.replace('_', '-')
- txt = txt.split('/')[0]
- res = requests.post(gilda_url, json={'text': txt})
- if res.status_code != 200:
- logger.warning('Gilda service responded with status code %d' %
- res.status_code)
+ txt = txt.replace("_group", "")
+ txt = txt.replace("_", "-")
+ txt = txt.split("/")[0]
+ res = gilda.ground(txt)
+ if not res:
+ logger.warning(f"Gilda grounder returned no results for {txt}")
return None
- rj = res.json()
- if not rj:
- return None
- top_term = rj[0]['term']
- return top_term
-
+ top_term = res[0].term
+ return top_term.to_json()
diff --git a/indra/sources/rlimsp/processor.py b/indra/sources/rlimsp/processor.py
index d17df1ef24..7587354c8b 100644
--- a/indra/sources/rlimsp/processor.py
+++ b/indra/sources/rlimsp/processor.py
@@ -251,7 +251,10 @@ def get_agent_from_entity_info(entity_info):
refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
# These we take as is
elif id_dict['source'] in ('MESH', 'OMIM'):
- refs[id_dict['source']] = id_dict['idString']
+ if ';' in id_dict['idString']:
+ refs[id_dict['source']] = id_dict['idString'].split(';')[0]
+ else:
+ refs[id_dict['source']] = id_dict['idString']
# CTD is sometimes used for MESH chemical IDs but can also be just '-'
elif id_dict['source'] == 'CTD':
if id_dict['idString'] != '-':
diff --git a/indra/sources/signor/processor.py b/indra/sources/signor/processor.py
index 91fd1f33c2..5a163c6d3b 100644
--- a/indra/sources/signor/processor.py
+++ b/indra/sources/signor/processor.py
@@ -246,7 +246,11 @@ def _get_agent(self, ent_name, ent_type, id, database):
# SIGNOR's format in which it leaves extra spaces around
# the ID, as in 'CID: 923'
id = id[4:].strip()
- elif database == 'ChEBI' and id.startswith('SID:'):
# In older releases, PubChem substance IDs were listed with
# ChEBI as the source; later releases list them under
# PUBCHEM instead
+ elif database in {'ChEBI', 'PUBCHEM'} \
+ and id.startswith('SID:'):
gnd_type = 'PUBCHEM.SUBSTANCE'
id = id[4:].strip()
db_refs = {gnd_type: id}
diff --git a/indra/sources/ubibrowser/api.py b/indra/sources/ubibrowser/api.py
index e2a8fea4e3..e9690a9a28 100644
--- a/indra/sources/ubibrowser/api.py
+++ b/indra/sources/ubibrowser/api.py
@@ -4,7 +4,7 @@
from .processor import UbiBrowserProcessor
-DOWNLOAD_URL = 'http://ubibrowser.ncpsb.org.cn/v2/Public/download/literature/'
+DOWNLOAD_URL = 'http://ubibrowser.bio-it.cn/ubibrowser_v3/Public/download/literature/'
E3_URL = DOWNLOAD_URL + 'literature.E3.txt'
DUB_URL = DOWNLOAD_URL + 'literature.DUB.txt'
diff --git a/indra/sources/ubibrowser/processor.py b/indra/sources/ubibrowser/processor.py
index 0403fa8d2e..68182fb763 100644
--- a/indra/sources/ubibrowser/processor.py
+++ b/indra/sources/ubibrowser/processor.py
@@ -10,23 +10,33 @@ def __init__(self, e3_df, dub_df):
self.statements = []
def extract_statements(self):
- for df, stmt_type in [(self.e3_df, Ubiquitination),
- (self.dub_df, Deubiquitination)]:
+ for df, stmt_type, subj_suffix in \
+ [(self.e3_df, Ubiquitination, 'E3'),
+ (self.dub_df, Deubiquitination, 'DUB')]:
for _, row in df.iterrows():
- stmt = self._process_row(row, stmt_type)
+ stmt = self._process_row(row, stmt_type, subj_suffix)
if stmt:
self.statements.append(stmt)
@staticmethod
- def _process_row(row, stmt_type):
+ def _process_row(row, stmt_type, subj_suffix):
# The subject's columns are suffixed by table type, "(E3)" in the E3
# table and "(DUB)" in the DUB table; subj_suffix selects which one
# There are some examples where a complex is implied (e.g., BMI1-RNF2),
# for simplicity we just ignore these
- if '-' in row['E3AC']:
+ if '#' in row[f'SwissProt AC ({subj_suffix})']:
return None
- subj_agent = get_standard_agent(row['E3GENE'], {'UP': row['E3AC']})
- obj_agent = get_standard_agent(row['SUBGENE'], {'UP': row['SUBAC']})
+ # Some rows have no E3/DUB accession at all (given as '-'); we skip these
+ elif row[f'SwissProt AC ({subj_suffix})'] == '-':
+ return None
+ # The substrate accession can be missing ('-') too; skip those rows as well
+ if row['SwissProt AC (Substrate)'] == '-':
+ return None
+ subj_agent = \
+ get_standard_agent(row[f'Gene Symbol ({subj_suffix})'],
+ {'UP': row[f'SwissProt AC ({subj_suffix})']})
+ obj_agent = get_standard_agent(row['Gene Symbol (Substrate)'],
+ {'UP': row['SwissProt AC (Substrate)']})
if row['SOURCE'] == 'MEDLINE' and row['SOURCEID'] != 'UNIPROT':
# Note: we sometimes get int here
pmid = str(row['SOURCEID'])
diff --git a/indra/tests/test_sources/resources/ubibrowser_dub.txt b/indra/tests/test_sources/resources/ubibrowser_dub.txt
index ab5d69977d..3d290f03b0 100644
--- a/indra/tests/test_sources/resources/ubibrowser_dub.txt
+++ b/indra/tests/test_sources/resources/ubibrowser_dub.txt
@@ -1,5 +1,5 @@
-NUMBER E3ID SUBID E3AC SUBAC E3GENE SUBGENE SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
-671 UBP10_HUMAN PTEN_HUMAN Q14694 P60484 USP10 PTEN MEDLINE 28852924 We further demonstrated that USP10 directly interacted with and stabilized PTEN via deubiquitination. USP 1 DUB H.sapiens
-673 UBP36_HUMAN SODM_HUMAN Q9P275 P04179 USP36 SOD2 MEDLINE 21268071 we identified a deubiquitinating enzyme USP36 that regulates the protein stability of SOD2 USP 1 DUB H.sapiens
-675 UBP13_HUMAN UBL4A_HUMAN Q92995 P11441 USP13 UBL4A MEDLINE 24424410 we identify USP13 as a gp78-associated DUB that eliminates ubiquitin conjugates from Ubl4A to maintain the functionality of Bag6. USP 2 DUB H.sapiens
-677 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE 19363159 We now report the discovery that the deubiquitinating enzyme ubiquitin-specific protease 33 (USP33) binds beta-arrestin2 and leads to the deubiquitination of beta-arrestins. USP 1 DUB H.sapiens
+NUMBER SwissProt ID (DUB) SwissProt ID (Substrate) SwissProt AC (DUB) SwissProt AC (Substrate) Gene Symbol (DUB) Gene Symbol (Substrate) SOURCE SOURCEID SENTENCE DUBTYPE COUNT type species
+56 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE 19363159 "We now report the discovery that the deubiquitinating enzyme ubiquitin-specific protease 33 (USP33) binds beta-arrestin2 and leads to the deubiquitination of beta-arrestins" USP 1 Training data H.sapiens
+129 UBP13_HUMAN UBL4A_HUMAN Q92995 P11441 USP13 UBL4A MEDLINE 24424410 "we identify USP13 as a gp78-associated DUB that eliminates ubiquitin conjugates from Ubl4A to maintain the functionality of Bag6" USP 1 Other H.sapiens
+388 UBP10_HUMAN PTEN_HUMAN Q14694 P60484 USP10 PTEN MEDLINE 28852924 "We further demonstrated that USP10 directly interacted with and stabilized PTEN via deubiquitination" USP 1 Training data H.sapiens
+726 UBP36_HUMAN SODM_HUMAN Q9P275 P04179 USP36 SOD2 MEDLINE 21268071 "we identified a deubiquitinating enzyme USP36 that regulates the protein stability of SOD2" USP 1 Other H.sapiens
diff --git a/indra/tests/test_sources/resources/ubibrowser_e3.txt b/indra/tests/test_sources/resources/ubibrowser_e3.txt
index 22a1e52e21..95cd044892 100644
--- a/indra/tests/test_sources/resources/ubibrowser_e3.txt
+++ b/indra/tests/test_sources/resources/ubibrowser_e3.txt
@@ -1,6 +1,6 @@
-NUMBER E3ID SUBID E3AC SUBAC E3GENE SUBGENE SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
-1 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1 MEDLINE 16979136 Here we report that gp78, a ubiquitin ligase (E3) pairing with mammalian Ubc7 for ERAD, ubiquitinates and facilitates degradation of ATZ, the classic deficiency variant of AAT having a Z mutation (Glu 342 Lys) RING 3 E3 H.sapiens
-15 AMFR2_HUMAN HMDH_HUMAN Q9UKV5 P04035 AMFR HMGCR MEDLINE 20458442 UBE2G2, a previously known E2 of gp78, is demonstrated to be involved in the sterol-regulated ubiquitination and degradation of HMGCR RING 4 E3 H.sapiens
-300 BRCA1_HUMAN BRCA1_HUMAN P38398 P38398 BRCA1 BRCA1 UniProt RNF8_BOVIN Following DNA double-strand breaks (DSBs), it is recruited to the sites of damage by ATM-phosphorylated MDC1, mediates the ubiquitination of histones H2A and H2AX, thereby promoting the formation of TP53BP1 and BRCA1 ionizing radiation-induced foci (IRIF) RING 21 E3 H.sapiens
-5642 HRD1_CAEEL Q9BMU4_CAEEL Q20798 Q9BMU4 sel-11 atln-1 MEDLINE 32916628 UbiNet 2.0 RING 1 E3 C.elegans
-5644 A0A2I4KBP1_DANRE SHH_DANRE A0A2I4KBP1 Q92008 gan shha MEDLINE 31503551 UbiNet 2.0 other 1 E3 D.rerio
+NUMBER SwissProt ID (E3) SwissProt ID (Substrate) SwissProt AC (E3) SwissProt AC (Substrate) Gene Symbol (E3) Gene Symbol (Substrate) SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
+109 HRD1_CAEEL Q9BMU4_CAEEL Q20798 Q9BMU4 sel-11 atln-1 MEDLINE 32916628 32916628 RING 1 Other C.elegans
+167 A0A2I4KBP1_DANRE SHH_DANRE A0A2I4KBP1 Q92008 gan shha MEDLINE 31503551 31503551 Other 1 Other D.rerio
+198 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1 MEDLINE 16979136 "Here we report that gp78, a ubiquitin ligase (E3) pairing with mammalian Ubc7 for ERAD, ubiquitinates and facilitates degradation of ATZ, the classic deficiency variant of AAT having a Z mutation (Glu 342 Lys)" RING 1 Training data H.sapiens
+1040 GAN_HUMAN SHH_HUMAN Q9H2C0 Q15465 GAN SHH MEDLINE 31503551 31503551 BTB_3 1 Other H.sapiens
+2631 SYVN1_HUMAN ATLA1_HUMAN Q86TM6 Q8WXF7 SYVN1 ATL1 MEDLINE 32916628 "The E3 Ubiquitin Ligase SYVN1 Ubiquitinates Atlastins to Remodel the Endoplasmic Reticulum Network." RING 1 Other H.sapiens
diff --git a/indra/tests/test_sources/test_ubibrowser.py b/indra/tests/test_sources/test_ubibrowser.py
index fefe074b64..fb4c040853 100644
--- a/indra/tests/test_sources/test_ubibrowser.py
+++ b/indra/tests/test_sources/test_ubibrowser.py
@@ -13,14 +13,15 @@
def test_extract_statements():
up = ubibrowser.process_file(e3_file, dub_file)
assert len(up.statements) == 9
- assert isinstance(up.statements[0], Ubiquitination)
+ assert isinstance(up.statements[2], Ubiquitination)
assert isinstance(up.statements[-1], Deubiquitination)
assert_valid_statements(up.statements)
- #1 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1
- # MEDLINE 16979136 Here we report that ... RING 3 E3 H.sapiens
- e3_stmt = up.statements[0]
+ #198 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1
+ # MEDLINE 16979136 "Here we report that ..." RING 1
+ # "Training data" H.sapiens
+ e3_stmt = up.statements[2]
assert e3_stmt.enz.name == 'AMFR'
assert e3_stmt.enz.db_refs['UP'] == 'Q9UKV5'
assert e3_stmt.sub.name == 'SERPINA1'
@@ -30,9 +31,10 @@ def test_extract_statements():
assert e3_stmt.evidence[0].pmid == '16979136'
assert e3_stmt.evidence[0].text.startswith('Here we report that')
- # 677 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1
- # MEDLINE 19363159 We now report the discovery that... USP 1 DUB H.sapiens
- dub_stmt = up.statements[-1]
+ # 56 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE
+ # 19363159 "We now report the discovery that ..." USP 1 "Training data"
+ # H.sapiens
+ dub_stmt = up.statements[5]
assert dub_stmt.enz.name == 'USP33'
assert dub_stmt.enz.db_refs['UP'] == 'Q8TEY7'
assert dub_stmt.sub.name == 'ARRB1'