Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Db sources updates #1423

Merged
merged 14 commits into from
Nov 22, 2023
2 changes: 1 addition & 1 deletion indra/sources/bel/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

logger = logging.getLogger(__name__)

version = 'v1.0.0'
version = 'v1.1.2'
branch = 'https://github.com/cthoyt/selventa-knowledge/raw/' \
'{}/selventa_knowledge/{}'
large_corpus_url = branch.format(version, 'large_corpus.bel.nodelink.json.gz')
Expand Down
23 changes: 17 additions & 6 deletions indra/sources/bel/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,9 +547,14 @@ def get_db_refs_by_name(ns, name, node_data):
db_refs = {'UP': up_id}
# Map Selventa families and complexes to FamPlex
elif ns == 'SFAM':
sfam_id, xrefs = selventa_lookup[('SFAM', name)]
db_refs = {'SFAM': sfam_id}
indra_name = bel_to_indra.get(name)
try:
sfam_id, xrefs = selventa_lookup[('SFAM', name)]
db_refs = {"SFAM": sfam_id}
indra_name = bel_to_indra.get(name)
except KeyError:
indra_name = None
db_refs = None

if indra_name is None:
logger.info('Could not find mapping for BEL/SFAM family: '
'%s (%s)' % (name, node_data))
Expand Down Expand Up @@ -614,9 +619,15 @@ def get_db_refs_by_name(ns, name, node_data):
name = chebi_client.get_chebi_name_from_id(chebi_id)
# SDIS, SCHEM: Look up the ID and include it in the db_refs
elif ns in {'SDIS', 'SCHEM'}:
sid, xrefs = selventa_lookup[(ns, name)]
db_refs = xrefs.copy()
db_refs[ns] = sid
try:
sid, xrefs = selventa_lookup[(ns, name)]
db_refs = xrefs.copy()
db_refs[ns] = sid
except KeyError:
logger.info(
f"Could not map Selventa name {name} to ID for {ns}."
)
return name, None
elif ns == 'TEXT':
db_refs = {ns: name}
elif ns == 'TAX':
Expand Down
2 changes: 1 addition & 1 deletion indra/sources/ctd/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .processor import CTDProcessor, CTDChemicalDiseaseProcessor, \
CTDGeneDiseaseProcessor, CTDChemicalGeneProcessor

base_url = 'http://ctdbase.org/reports/'
base_url = 'https://ctdbase.org/reports/'

urls = {
'chemical_gene': base_url + 'CTD_chem_gene_ixns.tsv.gz',
Expand Down
5 changes: 4 additions & 1 deletion indra/sources/phosphoelm/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import csv
import logging

from tqdm import tqdm

from .processor import PhosphoElmProcessor

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -36,7 +38,8 @@ def _get_json_from_entry_rows(row_iter):
"""Loop body to generate a json friendly structure"""
ppelm_json = []
columns = next(row_iter)
for entry in row_iter:
logger.info('Processing Phospho.ELM dump')
for entry in tqdm(row_iter):
row_dict = {c: e for c, e in zip(columns, entry)}
ppelm_json.append(row_dict)
return ppelm_json
28 changes: 12 additions & 16 deletions indra/sources/phosphoelm/processor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import logging
import requests
from tqdm import tqdm

import gilda
from indra.databases import uniprot_client, hgnc_client
from indra.statements.validate import validate_text_refs
from indra.statements import Phosphorylation, Evidence, Agent

from .phosphoelm_mapping import phosphoelm_mapping

gilda_url = 'http://grounding.indra.bio/ground'
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -40,7 +40,8 @@ def process_phosphorylations(self, skip_empty=True):
Default: True. If False, also create statements when upstream
kinases in entry['kinases'] are not known.
"""
for entry in self._phosphoelm_data:
logger.info("Processing Phospho.ELM phosphorylations")
for entry in tqdm(self._phosphoelm_data):
if entry['species'].lower() != 'homo sapiens' or\
skip_empty and not entry['kinases']:
# Skip entries without any kinases or if species is other
Expand Down Expand Up @@ -156,17 +157,12 @@ def _agent_from_str(txt):

def _gilda_grounder(txt):
# Pre-process text for grounding
txt = txt.replace('_group', '')
txt = txt.replace('_', '-')
txt = txt.split('/')[0]
res = requests.post(gilda_url, json={'text': txt})
if res.status_code != 200:
logger.warning('Gilda service responded with status code %d' %
res.status_code)
txt = txt.replace("_group", "")
txt = txt.replace("_", "-")
txt = txt.split("/")[0]
res = gilda.ground(txt)
if not res:
logger.warning(f"Gilda grounder returned no results for {txt}")
return None
rj = res.json()
if not rj:
return None
top_term = rj[0]['term']
return top_term

top_term = res[0].term
return top_term.to_json()
5 changes: 4 additions & 1 deletion indra/sources/rlimsp/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,10 @@ def get_agent_from_entity_info(entity_info):
refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
# These we take as is
elif id_dict['source'] in ('MESH', 'OMIM'):
refs[id_dict['source']] = id_dict['idString']
if ';' in id_dict['idString']:
refs[id_dict['source']] = id_dict['idString'].split(';')[0]
else:
refs[id_dict['source']] = id_dict['idString']
# CTD is sometimes used for MESH chemical IDs but can also be just '-'
elif id_dict['source'] == 'CTD':
if id_dict['idString'] != '-':
Expand Down
6 changes: 5 additions & 1 deletion indra/sources/signor/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,11 @@ def _get_agent(self, ent_name, ent_type, id, database):
# SIGNOR's format in which it leaves extra spaces around
# the ID, as in 'CID: 923'
id = id[4:].strip()
elif database == 'ChEBI' and id.startswith('SID:'):
# In older releases, PubChem substance IDs were used with
# ChEBI as the source; these were later changed to use
# PUBCHEM as the source
elif database in {'ChEBI', 'PUBCHEM'} \
and id.startswith('SID:'):
gnd_type = 'PUBCHEM.SUBSTANCE'
id = id[4:].strip()
db_refs = {gnd_type: id}
Expand Down
2 changes: 1 addition & 1 deletion indra/sources/ubibrowser/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .processor import UbiBrowserProcessor


DOWNLOAD_URL = 'http://ubibrowser.ncpsb.org.cn/v2/Public/download/literature/'
DOWNLOAD_URL = 'http://ubibrowser.bio-it.cn/ubibrowser_v3/Public/download/literature/'
E3_URL = DOWNLOAD_URL + 'literature.E3.txt'
DUB_URL = DOWNLOAD_URL + 'literature.DUB.txt'

Expand Down
24 changes: 17 additions & 7 deletions indra/sources/ubibrowser/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,33 @@ def __init__(self, e3_df, dub_df):
self.statements = []

def extract_statements(self):
for df, stmt_type in [(self.e3_df, Ubiquitination),
(self.dub_df, Deubiquitination)]:
for df, stmt_type, subj_suffix in \
[(self.e3_df, Ubiquitination, 'E3'),
(self.dub_df, Deubiquitination, 'DUB')]:
for _, row in df.iterrows():
stmt = self._process_row(row, stmt_type)
stmt = self._process_row(row, stmt_type, subj_suffix)
if stmt:
self.statements.append(stmt)

@staticmethod
def _process_row(row, stmt_type):
def _process_row(row, stmt_type, subj_suffix):
# Note that even in the DUB table the subject of the statement
# is called "E3"
# There are some examples where a complex is implied (e.g., BMI1-RNF2);
# for simplicity we just ignore these
if '-' in row['E3AC']:
if '#' in row[f'SwissProt AC ({subj_suffix})']:
return None
subj_agent = get_standard_agent(row['E3GENE'], {'UP': row['E3AC']})
obj_agent = get_standard_agent(row['SUBGENE'], {'UP': row['SUBAC']})
# Interestingly, some of the E3s are missing entirely; we skip these
elif row[f'SwissProt AC ({subj_suffix})'] == '-':
return None
# Some of the same corner cases apply to the substrate as well
if row['SwissProt AC (Substrate)'] == '-':
return None
subj_agent = \
get_standard_agent(row[f'Gene Symbol ({subj_suffix})'],
{'UP': row[f'SwissProt AC ({subj_suffix})']})
obj_agent = get_standard_agent(row['Gene Symbol (Substrate)'],
{'UP': row['SwissProt AC (Substrate)']})
if row['SOURCE'] == 'MEDLINE' and row['SOURCEID'] != 'UNIPROT':
# Note: we sometimes get int here
pmid = str(row['SOURCEID'])
Expand Down
10 changes: 5 additions & 5 deletions indra/tests/test_sources/resources/ubibrowser_dub.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
NUMBER E3ID SUBID E3AC SUBAC E3GENE SUBGENE SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
671 UBP10_HUMAN PTEN_HUMAN Q14694 P60484 USP10 PTEN MEDLINE 28852924 We further demonstrated that USP10 directly interacted with and stabilized PTEN via deubiquitination. USP 1 DUB H.sapiens
673 UBP36_HUMAN SODM_HUMAN Q9P275 P04179 USP36 SOD2 MEDLINE 21268071 we identified a deubiquitinating enzyme USP36 that regulates the protein stability of SOD2 USP 1 DUB H.sapiens
675 UBP13_HUMAN UBL4A_HUMAN Q92995 P11441 USP13 UBL4A MEDLINE 24424410 we identify USP13 as a gp78-associated DUB that eliminates ubiquitin conjugates from Ubl4A to maintain the functionality of Bag6. USP 2 DUB H.sapiens
677 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE 19363159 We now report the discovery that the deubiquitinating enzyme ubiquitin-specific protease 33 (USP33) binds beta-arrestin2 and leads to the deubiquitination of beta-arrestins. USP 1 DUB H.sapiens
NUMBER SwissProt ID (DUB) SwissProt ID (Substrate) SwissProt AC (DUB) SwissProt AC (Substrate) Gene Symbol (DUB) Gene Symbol (Substrate) SOURCE SOURCEID SENTENCE DUBTYPE COUNT type species
56 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE 19363159 "We now report the discovery that the deubiquitinating enzyme ubiquitin-specific protease 33 (<span class=""match term0"">USP33</span>) binds beta-arrestin2 and leads to the deubiquitination of beta-arrestins" USP 1 Training data H.sapiens
129 UBP13_HUMAN UBL4A_HUMAN Q92995 P11441 USP13 UBL4A MEDLINE 24424410 "we identify <span class=""match term0"">USP13</span> as a gp78-associated DUB that eliminates ubiquitin conjugates from <span class=""match term1"">Ubl4A</span> to maintain the functionality of Bag6" USP 1 Other H.sapiens
388 UBP10_HUMAN PTEN_HUMAN Q14694 P60484 USP10 PTEN MEDLINE 28852924 "We further demonstrated that <span class=""match term0"">USP10</span> directly interacted with and stabilized <span class=""match term1"">PTEN</span> via deubiquitination" USP 1 Training data H.sapiens
726 UBP36_HUMAN SODM_HUMAN Q9P275 P04179 USP36 SOD2 MEDLINE 21268071 "we identified a deubiquitinating enzyme <span class=""match term0"">USP36</span> that regulates the protein stability of <span class=""match term1"">SOD2</span>" USP 1 Other H.sapiens
12 changes: 6 additions & 6 deletions indra/tests/test_sources/resources/ubibrowser_e3.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
NUMBER E3ID SUBID E3AC SUBAC E3GENE SUBGENE SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
1 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1 MEDLINE 16979136 Here we report that gp78, a ubiquitin ligase (E3) pairing with mammalian Ubc7 for ERAD, ubiquitinates and facilitates degradation of ATZ, the classic deficiency variant of AAT having a Z mutation (Glu 342 Lys) RING 3 E3 H.sapiens
15 AMFR2_HUMAN HMDH_HUMAN Q9UKV5 P04035 AMFR HMGCR MEDLINE 20458442 UBE2G2, a previously known E2 of gp78, is demonstrated to be involved in the sterol-regulated ubiquitination and degradation of HMGCR RING 4 E3 H.sapiens
300 BRCA1_HUMAN BRCA1_HUMAN P38398 P38398 BRCA1 BRCA1 UniProt RNF8_BOVIN Following DNA double-strand breaks (DSBs), it is recruited to the sites of damage by ATM-phosphorylated MDC1, mediates the ubiquitination of histones H2A and H2AX, thereby promoting the formation of TP53BP1 and BRCA1 ionizing radiation-induced foci (IRIF) RING 21 E3 H.sapiens
5642 HRD1_CAEEL Q9BMU4_CAEEL Q20798 Q9BMU4 sel-11 atln-1 MEDLINE 32916628 UbiNet 2.0 RING 1 E3 C.elegans
5644 A0A2I4KBP1_DANRE SHH_DANRE A0A2I4KBP1 Q92008 gan shha MEDLINE 31503551 UbiNet 2.0 other 1 E3 D.rerio
NUMBER SwissProt ID (E3) SwissProt ID (Substrate) SwissProt AC (E3) SwissProt AC (Substrate) Gene Symbol (E3) Gene Symbol (Substrate) SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
109 HRD1_CAEEL Q9BMU4_CAEEL Q20798 Q9BMU4 sel-11 atln-1 MEDLINE 32916628 32916628 RING 1 Other C.elegans
167 A0A2I4KBP1_DANRE SHH_DANRE A0A2I4KBP1 Q92008 gan shha MEDLINE 31503551 31503551 Other 1 Other D.rerio
198 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1 MEDLINE 16979136 "Here we report that <span class=""match term0"">gp78</span>, a ubiquitin ligase (E3) pairing with mammalian Ubc7 for ERAD, ubiquitinates and facilitates degradation of ATZ, the classic deficiency variant of <span class=""match term1"">AAT</span> having a Z mutation (Glu 342 Lys)" RING 1 Training data H.sapiens
1040 GAN_HUMAN SHH_HUMAN Q9H2C0 Q15465 GAN SHH MEDLINE 31503551 31503551 BTB_3 1 Other H.sapiens
2631 SYVN1_HUMAN ATLA1_HUMAN Q86TM6 Q8WXF7 SYVN1 ATL1 MEDLINE 32916628 "The E3 Ubiquitin Ligase <span class=""match term0"">SYVN1</span> Ubiquitinates Atlastins to Remodel the Endoplasmic Reticulum Network." RING 1 Other H.sapiens
16 changes: 9 additions & 7 deletions indra/tests/test_sources/test_ubibrowser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@
def test_extract_statements():
up = ubibrowser.process_file(e3_file, dub_file)
assert len(up.statements) == 9
assert isinstance(up.statements[0], Ubiquitination)
assert isinstance(up.statements[2], Ubiquitination)
assert isinstance(up.statements[-1], Deubiquitination)

assert_valid_statements(up.statements)

#1 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1
# MEDLINE 16979136 Here we report that ... RING 3 E3 H.sapiens
e3_stmt = up.statements[0]
#198 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1
# MEDLINE 16979136 "Here we report that ..." RING 1
# "Training data" H.sapiens
e3_stmt = up.statements[2]
assert e3_stmt.enz.name == 'AMFR'
assert e3_stmt.enz.db_refs['UP'] == 'Q9UKV5'
assert e3_stmt.sub.name == 'SERPINA1'
Expand All @@ -30,9 +31,10 @@ def test_extract_statements():
assert e3_stmt.evidence[0].pmid == '16979136'
assert e3_stmt.evidence[0].text.startswith('Here we report that')

# 677 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1
# MEDLINE 19363159 We now report the discovery that... USP 1 DUB H.sapiens
dub_stmt = up.statements[-1]
# 56 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE
# 19363159 "We now report the discovery that " "USP 1" "Training data"
# H.sapiens
dub_stmt = up.statements[5]
assert dub_stmt.enz.name == 'USP33'
assert dub_stmt.enz.db_refs['UP'] == 'Q8TEY7'
assert dub_stmt.sub.name == 'ARRB1'
Expand Down
Loading