Skip to content

Commit

Permalink
Merge pull request #1423 from sorgerlab/db-sources-updates
Browse files Browse the repository at this point in the history
Db sources updates
  • Loading branch information
bgyori authored Nov 22, 2023
2 parents 68f7d15 + 5360dea commit 4c33d60
Show file tree
Hide file tree
Showing 12 changed files with 82 additions and 53 deletions.
2 changes: 1 addition & 1 deletion indra/sources/bel/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

logger = logging.getLogger(__name__)

version = 'v1.0.0'
version = 'v1.1.2'
branch = 'https://github.com/cthoyt/selventa-knowledge/raw/' \
'{}/selventa_knowledge/{}'
large_corpus_url = branch.format(version, 'large_corpus.bel.nodelink.json.gz')
Expand Down
23 changes: 17 additions & 6 deletions indra/sources/bel/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,9 +547,14 @@ def get_db_refs_by_name(ns, name, node_data):
db_refs = {'UP': up_id}
# Map Selventa families and complexes to FamPlex
elif ns == 'SFAM':
sfam_id, xrefs = selventa_lookup[('SFAM', name)]
db_refs = {'SFAM': sfam_id}
indra_name = bel_to_indra.get(name)
try:
sfam_id, xrefs = selventa_lookup[('SFAM', name)]
db_refs = {"SFAM": sfam_id}
indra_name = bel_to_indra.get(name)
except KeyError:
indra_name = None
db_refs = None

if indra_name is None:
logger.info('Could not find mapping for BEL/SFAM family: '
'%s (%s)' % (name, node_data))
Expand Down Expand Up @@ -614,9 +619,15 @@ def get_db_refs_by_name(ns, name, node_data):
name = chebi_client.get_chebi_name_from_id(chebi_id)
# SDIS, SCHEM: Look up the ID and include it in the db_refs
elif ns in {'SDIS', 'SCHEM'}:
sid, xrefs = selventa_lookup[(ns, name)]
db_refs = xrefs.copy()
db_refs[ns] = sid
try:
sid, xrefs = selventa_lookup[(ns, name)]
db_refs = xrefs.copy()
db_refs[ns] = sid
except KeyError:
logger.info(
f"Could not map Selventa name {name} to ID for {ns}."
)
return name, None
elif ns == 'TEXT':
db_refs = {ns: name}
elif ns == 'TAX':
Expand Down
2 changes: 1 addition & 1 deletion indra/sources/ctd/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .processor import CTDProcessor, CTDChemicalDiseaseProcessor, \
CTDGeneDiseaseProcessor, CTDChemicalGeneProcessor

base_url = 'http://ctdbase.org/reports/'
base_url = 'https://ctdbase.org/reports/'

urls = {
'chemical_gene': base_url + 'CTD_chem_gene_ixns.tsv.gz',
Expand Down
5 changes: 4 additions & 1 deletion indra/sources/phosphoelm/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import csv
import logging

from tqdm import tqdm

from .processor import PhosphoElmProcessor

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -36,7 +38,8 @@ def _get_json_from_entry_rows(row_iter):
"""Loop body to generate a json friendly structure"""
ppelm_json = []
columns = next(row_iter)
for entry in row_iter:
logger.info('Processing Phospho.ELM dump')
for entry in tqdm(row_iter):
row_dict = {c: e for c, e in zip(columns, entry)}
ppelm_json.append(row_dict)
return ppelm_json
28 changes: 12 additions & 16 deletions indra/sources/phosphoelm/processor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import logging
import requests
from tqdm import tqdm

import gilda
from indra.databases import uniprot_client, hgnc_client
from indra.statements.validate import validate_text_refs
from indra.statements import Phosphorylation, Evidence, Agent

from .phosphoelm_mapping import phosphoelm_mapping

gilda_url = 'http://grounding.indra.bio/ground'
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -40,7 +40,8 @@ def process_phosphorylations(self, skip_empty=True):
Default: True. If False, also create statements when upstream
kinases in entry['kinases'] are not known.
"""
for entry in self._phosphoelm_data:
logger.info("Processing Phospho.ELM phosphorylations")
for entry in tqdm(self._phosphoelm_data):
if entry['species'].lower() != 'homo sapiens' or\
skip_empty and not entry['kinases']:
# Skip entries without any kinases or if species is other
Expand Down Expand Up @@ -156,17 +157,12 @@ def _agent_from_str(txt):

def _gilda_grounder(txt):
# Pre-process text for grounding
txt = txt.replace('_group', '')
txt = txt.replace('_', '-')
txt = txt.split('/')[0]
res = requests.post(gilda_url, json={'text': txt})
if res.status_code != 200:
logger.warning('Gilda service responded with status code %d' %
res.status_code)
txt = txt.replace("_group", "")
txt = txt.replace("_", "-")
txt = txt.split("/")[0]
res = gilda.ground(txt)
if not res:
logger.warning(f"Gilda grounder returned no results for {txt}")
return None
rj = res.json()
if not rj:
return None
top_term = rj[0]['term']
return top_term

top_term = res[0].term
return top_term.to_json()
5 changes: 4 additions & 1 deletion indra/sources/rlimsp/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,10 @@ def get_agent_from_entity_info(entity_info):
refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
# These we take as is
elif id_dict['source'] in ('MESH', 'OMIM'):
refs[id_dict['source']] = id_dict['idString']
if ';' in id_dict['idString']:
refs[id_dict['source']] = id_dict['idString'].split(';')[0]
else:
refs[id_dict['source']] = id_dict['idString']
# CTD is sometimes used for MESH chemical IDs but can also be just '-'
elif id_dict['source'] == 'CTD':
if id_dict['idString'] != '-':
Expand Down
6 changes: 5 additions & 1 deletion indra/sources/signor/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,11 @@ def _get_agent(self, ent_name, ent_type, id, database):
# SIGNOR's format in which it leaves extra spaces around
# the ID, as in 'CID: 923'
id = id[4:].strip()
elif database == 'ChEBI' and id.startswith('SID:'):
# In older releases, PubChem substance IDs were used with
# ChEBI as the source; these were later changed to use
# PUBCHEM
elif database in {'ChEBI', 'PUBCHEM'} \
and id.startswith('SID:'):
gnd_type = 'PUBCHEM.SUBSTANCE'
id = id[4:].strip()
db_refs = {gnd_type: id}
Expand Down
2 changes: 1 addition & 1 deletion indra/sources/ubibrowser/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .processor import UbiBrowserProcessor


DOWNLOAD_URL = 'http://ubibrowser.ncpsb.org.cn/v2/Public/download/literature/'
DOWNLOAD_URL = 'http://ubibrowser.bio-it.cn/ubibrowser_v3/Public/download/literature/'
E3_URL = DOWNLOAD_URL + 'literature.E3.txt'
DUB_URL = DOWNLOAD_URL + 'literature.DUB.txt'

Expand Down
24 changes: 17 additions & 7 deletions indra/sources/ubibrowser/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,33 @@ def __init__(self, e3_df, dub_df):
self.statements = []

def extract_statements(self):
for df, stmt_type in [(self.e3_df, Ubiquitination),
(self.dub_df, Deubiquitination)]:
for df, stmt_type, subj_suffix in \
[(self.e3_df, Ubiquitination, 'E3'),
(self.dub_df, Deubiquitination, 'DUB')]:
for _, row in df.iterrows():
stmt = self._process_row(row, stmt_type)
stmt = self._process_row(row, stmt_type, subj_suffix)
if stmt:
self.statements.append(stmt)

@staticmethod
def _process_row(row, stmt_type):
def _process_row(row, stmt_type, subj_suffix):
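# Note that in the older file format even the DUB table called the
# subject of the statement "E3"; the newer format suffixes the subject
# columns with "E3" or "DUB", so the suffix is passed in explicitly.
# There are some examples where a complex is implied (e.g., BMI1-RNF2);
# for simplicity we just ignore these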
if '-' in row['E3AC']:
if '#' in row[f'SwissProt AC ({subj_suffix})']:
return None
subj_agent = get_standard_agent(row['E3GENE'], {'UP': row['E3AC']})
obj_agent = get_standard_agent(row['SUBGENE'], {'UP': row['SUBAC']})
# Interestingly, some of the E3s are missing entirely; we skip these
elif row[f'SwissProt AC ({subj_suffix})'] == '-':
return None
# Some of the same corner cases apply to the substrate as well
if row['SwissProt AC (Substrate)'] == '-':
return None
subj_agent = \
get_standard_agent(row[f'Gene Symbol ({subj_suffix})'],
{'UP': row[f'SwissProt AC ({subj_suffix})']})
obj_agent = get_standard_agent(row['Gene Symbol (Substrate)'],
{'UP': row['SwissProt AC (Substrate)']})
if row['SOURCE'] == 'MEDLINE' and row['SOURCEID'] != 'UNIPROT':
# Note: we sometimes get int here
pmid = str(row['SOURCEID'])
Expand Down
10 changes: 5 additions & 5 deletions indra/tests/test_sources/resources/ubibrowser_dub.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
NUMBER E3ID SUBID E3AC SUBAC E3GENE SUBGENE SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
671 UBP10_HUMAN PTEN_HUMAN Q14694 P60484 USP10 PTEN MEDLINE 28852924 We further demonstrated that USP10 directly interacted with and stabilized PTEN via deubiquitination. USP 1 DUB H.sapiens
673 UBP36_HUMAN SODM_HUMAN Q9P275 P04179 USP36 SOD2 MEDLINE 21268071 we identified a deubiquitinating enzyme USP36 that regulates the protein stability of SOD2 USP 1 DUB H.sapiens
675 UBP13_HUMAN UBL4A_HUMAN Q92995 P11441 USP13 UBL4A MEDLINE 24424410 we identify USP13 as a gp78-associated DUB that eliminates ubiquitin conjugates from Ubl4A to maintain the functionality of Bag6. USP 2 DUB H.sapiens
677 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE 19363159 We now report the discovery that the deubiquitinating enzyme ubiquitin-specific protease 33 (USP33) binds beta-arrestin2 and leads to the deubiquitination of beta-arrestins. USP 1 DUB H.sapiens
NUMBER SwissProt ID (DUB) SwissProt ID (Substrate) SwissProt AC (DUB) SwissProt AC (Substrate) Gene Symbol (DUB) Gene Symbol (Substrate) SOURCE SOURCEID SENTENCE DUBTYPE COUNT type species
56 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE 19363159 "We now report the discovery that the deubiquitinating enzyme ubiquitin-specific protease 33 (<span class=""match term0"">USP33</span>) binds beta-arrestin2 and leads to the deubiquitination of beta-arrestins" USP 1 Training data H.sapiens
129 UBP13_HUMAN UBL4A_HUMAN Q92995 P11441 USP13 UBL4A MEDLINE 24424410 "we identify <span class=""match term0"">USP13</span> as a gp78-associated DUB that eliminates ubiquitin conjugates from <span class=""match term1"">Ubl4A</span> to maintain the functionality of Bag6" USP 1 Other H.sapiens
388 UBP10_HUMAN PTEN_HUMAN Q14694 P60484 USP10 PTEN MEDLINE 28852924 "We further demonstrated that <span class=""match term0"">USP10</span> directly interacted with and stabilized <span class=""match term1"">PTEN</span> via deubiquitination" USP 1 Training data H.sapiens
726 UBP36_HUMAN SODM_HUMAN Q9P275 P04179 USP36 SOD2 MEDLINE 21268071 "we identified a deubiquitinating enzyme <span class=""match term0"">USP36</span> that regulates the protein stability of <span class=""match term1"">SOD2</span>" USP 1 Other H.sapiens
12 changes: 6 additions & 6 deletions indra/tests/test_sources/resources/ubibrowser_e3.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
NUMBER E3ID SUBID E3AC SUBAC E3GENE SUBGENE SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
1 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1 MEDLINE 16979136 Here we report that gp78, a ubiquitin ligase (E3) pairing with mammalian Ubc7 for ERAD, ubiquitinates and facilitates degradation of ATZ, the classic deficiency variant of AAT having a Z mutation (Glu 342 Lys) RING 3 E3 H.sapiens
15 AMFR2_HUMAN HMDH_HUMAN Q9UKV5 P04035 AMFR HMGCR MEDLINE 20458442 UBE2G2, a previously known E2 of gp78, is demonstrated to be involved in the sterol-regulated ubiquitination and degradation of HMGCR RING 4 E3 H.sapiens
300 BRCA1_HUMAN BRCA1_HUMAN P38398 P38398 BRCA1 BRCA1 UniProt RNF8_BOVIN Following DNA double-strand breaks (DSBs), it is recruited to the sites of damage by ATM-phosphorylated MDC1, mediates the ubiquitination of histones H2A and H2AX, thereby promoting the formation of TP53BP1 and BRCA1 ionizing radiation-induced foci (IRIF) RING 21 E3 H.sapiens
5642 HRD1_CAEEL Q9BMU4_CAEEL Q20798 Q9BMU4 sel-11 atln-1 MEDLINE 32916628 UbiNet 2.0 RING 1 E3 C.elegans
5644 A0A2I4KBP1_DANRE SHH_DANRE A0A2I4KBP1 Q92008 gan shha MEDLINE 31503551 UbiNet 2.0 other 1 E3 D.rerio
NUMBER SwissProt ID (E3) SwissProt ID (Substrate) SwissProt AC (E3) SwissProt AC (Substrate) Gene Symbol (E3) Gene Symbol (Substrate) SOURCE SOURCEID SENTENCE E3TYPE COUNT type species
109 HRD1_CAEEL Q9BMU4_CAEEL Q20798 Q9BMU4 sel-11 atln-1 MEDLINE 32916628 32916628 RING 1 Other C.elegans
167 A0A2I4KBP1_DANRE SHH_DANRE A0A2I4KBP1 Q92008 gan shha MEDLINE 31503551 31503551 Other 1 Other D.rerio
198 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1 MEDLINE 16979136 "Here we report that <span class=""match term0"">gp78</span>, a ubiquitin ligase (E3) pairing with mammalian Ubc7 for ERAD, ubiquitinates and facilitates degradation of ATZ, the classic deficiency variant of <span class=""match term1"">AAT</span> having a Z mutation (Glu 342 Lys)" RING 1 Training data H.sapiens
1040 GAN_HUMAN SHH_HUMAN Q9H2C0 Q15465 GAN SHH MEDLINE 31503551 31503551 BTB_3 1 Other H.sapiens
2631 SYVN1_HUMAN ATLA1_HUMAN Q86TM6 Q8WXF7 SYVN1 ATL1 MEDLINE 32916628 "The E3 Ubiquitin Ligase <span class=""match term0"">SYVN1</span> Ubiquitinates Atlastins to Remodel the Endoplasmic Reticulum Network." RING 1 Other H.sapiens
16 changes: 9 additions & 7 deletions indra/tests/test_sources/test_ubibrowser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@
def test_extract_statements():
up = ubibrowser.process_file(e3_file, dub_file)
assert len(up.statements) == 9
assert isinstance(up.statements[0], Ubiquitination)
assert isinstance(up.statements[2], Ubiquitination)
assert isinstance(up.statements[-1], Deubiquitination)

assert_valid_statements(up.statements)

#1 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1
# MEDLINE 16979136 Here we report that ... RING 3 E3 H.sapiens
e3_stmt = up.statements[0]
#198 AMFR2_HUMAN A1AT_HUMAN Q9UKV5 P01009 AMFR SERPINA1
# MEDLINE 16979136 "Here we report that ..." RING 1
# "Training data" H.sapiens
e3_stmt = up.statements[2]
assert e3_stmt.enz.name == 'AMFR'
assert e3_stmt.enz.db_refs['UP'] == 'Q9UKV5'
assert e3_stmt.sub.name == 'SERPINA1'
Expand All @@ -30,9 +31,10 @@ def test_extract_statements():
assert e3_stmt.evidence[0].pmid == '16979136'
assert e3_stmt.evidence[0].text.startswith('Here we report that')

# 677 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1
# MEDLINE 19363159 We now report the discovery that... USP 1 DUB H.sapiens
dub_stmt = up.statements[-1]
# 56 UBP33_HUMAN ARRB1_HUMAN Q8TEY7 P49407 USP33 ARRB1 MEDLINE
# 19363159 "We now report the discovery that ..." USP 1
# "Training data" H.sapiens
dub_stmt = up.statements[5]
assert dub_stmt.enz.name == 'USP33'
assert dub_stmt.enz.db_refs['UP'] == 'Q8TEY7'
assert dub_stmt.sub.name == 'ARRB1'
Expand Down

0 comments on commit 4c33d60

Please sign in to comment.