Skip to content

Commit fb3f91a

Browse files
committed
Annotate Reactome using offline database files #35
1 parent 6351208 commit fb3f91a

File tree

3 files changed

+123
-54
lines changed

3 files changed

+123
-54
lines changed

reports/annotate_reactome.py

Lines changed: 54 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -35,22 +35,45 @@ def inner(*args, sleep_s=sleep_s, **kw):
3535
return deco
3636

3737

38-
@_lxml_try(sleep_s=0.01)
39-
def fetch_title(s):
40-
reactome_id = extract_reactome_id(s)
41-
42-
if not reactome_id:
43-
return "N/A"
38+
def fetch_title(*strs, pathways_db_path=None):
    """Return the pathway title for each of the given strings, in order.

    Every string is passed through ``extract_reactome_id``; a string that
    yields no id contributes the placeholder ``"N/A"``.

    :param strs: strings (ids, links, labels) that may contain a Reactome id.
    :param pathways_db_path: optional path to a tab-separated pathways file.
        When given, it is loaded once with pandas (no header row, column
        names ``title`` and ``species``, ``index_col=0``) and titles are
        resolved with ``_db_fetch_title``. When omitted, each id is resolved
        with ``_web_fetch_title``.
    :return: list of titles, one per input string.
    """
    pathways = None
    if pathways_db_path:
        pathways = pd.read_csv(pathways_db_path, sep="\t", header=None,
                               names=["title", "species"], index_col=0)

    def resolve(label):
        # One input string -> one title (or the "N/A" placeholder).
        reactome_id = extract_reactome_id(label)
        if not reactome_id:
            return "N/A"
        if pathways is None:
            return _web_fetch_title(reactome_id)
        return _db_fetch_title(reactome_id, pathways)

    return [resolve(label) for label in strs]
60+
61+
62+
def _db_fetch_title(reactome_id, df):
63+
return df.title.loc[reactome_id]
64+
65+
66+
@_lxml_try(sleep_s=0.01)
def _web_fetch_title(reactome_id):
    """Fetch a pathway title from the Reactome web site.

    Parses the detail page for ``reactome_id`` with ``lxml.html`` and returns
    the text of its ``<title>`` element with the ``"Reactome | "`` prefix
    removed.

    NOTE(review): the ``_lxml_try`` decorator is defined outside this view;
    presumably it handles failed fetches - confirm before relying on it.

    :param reactome_id: id appended to the detail-page URL.
    :return: page title without the site prefix.
    """
    page = lxml.html.parse("http://www.reactome.org/content/detail/" + reactome_id)
    page_title = page.find(".//title").text
    return page_title.replace("Reactome | ", "")
4871

4972

5073
def extract_reactome_id(s):
    """Extract a Reactome pathway id from an arbitrary value.

    The value is converted with ``str()`` and searched, ignoring case, for
    ``R-HSA-`` followed by digits anywhere in the text.

    :param s: value that may contain an id (plain id, label, URL, ...).
    :return: the id normalised to ``"R-HSA-<digits>"`` (upper-case prefix),
        or ``None`` when no id is found.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    match = re.match(r".*R-HSA-(\d+).*", str(s), flags=re.IGNORECASE)
    if match is None:
        return None
    return "R-HSA-{}".format(match.group(1))
@@ -64,6 +87,8 @@ def _cli():
6487
)
6588
parser.add_argument('path', metavar="PATH",
6689
help="Table path")
90+
parser.add_argument('--db', metavar="PATH",
91+
help="ReactomePathways.txt file path")
6792

6893
parser.add_argument('col', metavar="INT_OR_STR",
6994
help="Index (>=1) or name of column containing Reactome pathway id. If "
@@ -80,6 +105,7 @@ def _cli():
80105
sep = args.sep
81106
col = args.col
82107
output_path = args.output_path
108+
pathways_db_path = args.db
83109

84110
try:
85111
col_i = int(col)
@@ -108,21 +134,26 @@ def _cli():
108134
if col_idx is None:
109135
print("Table header:", ", ".join("'{}'".format(c) for c in df.columns), file=sys.stderr)
110136
if index_col is not None:
111-
reactome_ids = df.index
137+
reactome_links = df.index
112138
else:
113-
reactome_ids = df.loc[:, col]
139+
reactome_links = df.loc[:, col]
114140
else:
115-
reactome_ids = df.iloc[:, col_idx]
116-
117-
titles = []
118-
print("Fetch titles for:", len(reactome_ids), "records", file=sys.stderr)
119-
for i, s in enumerate(reactome_ids, 1):
120-
titles.append(fetch_title(s))
121-
if i % 100 == 0:
122-
print('\n{} of {}'.format(i, len(reactome_ids)), file=sys.stderr)
123-
else:
124-
print('.', end="", file=sys.stderr)
125-
sys.stderr.flush()
141+
reactome_links = df.iloc[:, col_idx]
142+
143+
print("Fetch titles for:", len(reactome_links), "records", file=sys.stderr)
144+
if not pathways_db_path:
145+
# online mode:
146+
titles = []
147+
for i, s in enumerate(reactome_links, 1):
148+
titles.append(fetch_title(s))
149+
if i % 100 == 0:
150+
print('\n{} of {}'.format(i, len(reactome_links)), file=sys.stderr)
151+
else:
152+
print('.', end="", file=sys.stderr)
153+
sys.stderr.flush()
154+
else:
155+
# offline mode
156+
titles = fetch_title(*reactome_links, pathways_db_path=pathways_db_path)
126157
print(file=sys.stderr)
127158

128159
df['reactome_titles'] = titles

test/test_annotate_reactome.py

Lines changed: 59 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,33 +9,50 @@
99
@pytest.mark.parametrize("label,rid", [
    ("foo", None),
    ("2644605", None),
    ("R-HSA-2644605", "R-HSA-2644605"),
    ("r-hsa-2644605", "R-HSA-2644605"),
    ("foo_R-HSA-2644605_boo", "R-HSA-2644605"),
    ("foo_r-hsa-2644605_boo", "R-HSA-2644605"),
    ("foo.R-HSA-2644605-boo", "R-HSA-2644605"),
    ("foo-R-HSA-2644605.boo", "R-HSA-2644605"),
    ("foo-R-HSA-2644605boo", "R-HSA-2644605"),
    ("http://www.reactome.org/content/detail/R-HSA-2644605", "R-HSA-2644605"),
])
def test_extract_reactome_id(label, rid):
    """Check the id extracted from each label, including labels with no id."""
    extracted = extract_reactome_id(label)
    assert rid == extracted
2323

2424

25-
@pytest.mark.parametrize("label,title", [
26-
("foo", "N/A"),
27-
("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer"),
25+
@pytest.mark.parametrize("label,title,offline", [
    ("foo", "N/A", False),
    ("foo", "N/A", True),
    ("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer", False),
    ("foo.R-HSA-2644605-boo", "FBXW7 Mutants and NOTCH1 in Cancer", True),
])
def test_fetch_title(test_data, label, title, offline):
    """Check the title resolved for a single label, with and without the
    offline pathways file."""
    # The offline cases pass the pathways file shipped with the test data;
    # the other cases call fetch_title with no extra keyword arguments.
    kw = {'pathways_db_path': test_data("reactome/ReactomePathways.txt")} if offline else {}

    titles = fetch_title(label, **kw)
    assert title == titles[0]
37+
38+
39+
@pytest.mark.parametrize("offline", [True, False])
def test_fetch_titles(test_data, offline):
    """Check that several ids are resolved in one call and returned in the
    order they were passed."""
    expected = ['FBXW7 Mutants and NOTCH1 in Cancer', '2-LTR circle formation']
    kw = {'pathways_db_path': test_data("reactome/ReactomePathways.txt")} if offline else {}

    assert expected == fetch_title("R-HSA-2644605", "R-HSA-164843", **kw)
3148

3249

3350
def test_cli_help(capfd):
3451
run("python", "{}/reports/annotate_reactome.py".format(PROJECT_ROOT_PATH),
3552
"-h")
3653
output = capfd.readouterr()
3754
assert """python {}/reports/annotate_reactome.py -h
38-
usage: annotate_reactome.py [-h] [-F FS] [-o PATH] PATH INT_OR_STR
55+
usage: annotate_reactome.py [-h] [--db PATH] [-F FS] [-o PATH] PATH INT_OR_STR
3956
4057
Annotates table containing http://www.reactome.org pathways ids (R-HSA-nnnnn)
4158
with pathways titles
@@ -47,37 +64,48 @@ def test_cli_help(capfd):
4764
4865
optional arguments:
4966
-h, --help show this help message and exit
67+
--db PATH ReactomePathways.txt file path (default: None)
5068
-F FS Field separator, auto-detect if not specified (default: None)
5169
-o PATH Output path (default: None)
5270
""".format(PROJECT_ROOT_PATH) == output[0]
5371
assert "" == output[1]
5472

5573

56-
@pytest.mark.parametrize("file,args,result_stdout,result_stderr,fresult", [
57-
("empty.csv", ["1"], "", "File is empty: ", None),
58-
("no_header_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
59-
("no_header_comma.csv", ["0"], "", "Column index should be >= 1, but was: 0", None),
60-
("no_header_comma.csv", ["1"], "", "", "no_header_comma.result.txt"),
61-
("header_index_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
62-
("header_index_comma.csv", ["1"], "", "", "header_index_comma.result1.txt"),
63-
("header_index_comma.csv", ["''"], "", "", "header_index_comma.result2.txt"),
64-
("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
65-
("header_colname_comma.csv", ["1"], "", "", "header_colname_comma.result1.txt"),
66-
("header_colname_comma.csv", ["data"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None),
67-
("header_colname_comma.csv", ["data"], "", "", "header_colname_comma.result2.txt"),
74+
@pytest.mark.parametrize("file,args,result_stdout,result_stderr,fresult,offline", [
75+
# offline
76+
("empty.csv", ["1"], "", "File is empty: ", None, True),
77+
("no_header_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
78+
("no_header_comma.csv", ["0"], "", "Column index should be >= 1, but was: 0", None, True),
79+
("no_header_comma.csv", ["1"], "", "", "no_header_comma.result.txt", True),
80+
("header_index_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
81+
("header_index_comma.csv", ["1"], "", "", "header_index_comma.result1.txt", True),
82+
("header_index_comma.csv", ["''"], "", "", "header_index_comma.result2.txt", True),
83+
("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
84+
("header_colname_comma.csv", ["1"], "", "", "header_colname_comma.result1.txt", True),
85+
("header_colname_comma.csv", ["data"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, True),
86+
("header_colname_comma.csv", ["data"], "", "", "header_colname_comma.result2.txt", True),
6887
("header_colname_comma.csv", ["data", "-F','"], "FBXW7 Mutants and NOTCH1 in Cancer", "",
69-
None),
70-
("header_colname_tab.tsv", ["data"], "", "", "header_colname_comma.result2.txt"),
71-
("header_colname_tab.tsv", ["data", "-F'\t'"], "", "", "header_colname_tab.result.txt"),
88+
None, True),
89+
("header_colname_tab.tsv", ["data"], "", "", "header_colname_comma.result2.txt", True),
90+
("header_colname_tab.tsv", ["data", "-F'\t'"], "", "", "header_colname_tab.result.txt", True),
7291
("header_colname_comma.csv", ["data", "-F' '"], "",
73-
"KeyError: 'the label [data] is not in the [columns]'", None),
92+
"KeyError: 'the label [data] is not in the [columns]'", None, True),
93+
94+
# using web access:
95+
("header_colname_comma.csv", ["1"], "FBXW7 Mutants and NOTCH1 in Cancer", "", None, False),
7496
])
75-
def test_foo(test_data, tmp_path, capfd, file, args, result_stdout, result_stderr, fresult):
97+
def test_cli(test_data, tmp_path, capfd, file, args, result_stdout, result_stderr, fresult,
98+
offline):
99+
100+
pathways_db_path = test_data("reactome/ReactomePathways.txt") if offline else None
76101
input_path = str(test_data("reactome/" + file))
77102

78103
if fresult:
79104
args.append("-o " + str(tmp_path / "result.txt"))
80105

106+
if pathways_db_path:
107+
args.append("--db " + pathways_db_path)
108+
81109
run(
82110
"python", "{}/reports/annotate_reactome.py".format(PROJECT_ROOT_PATH),
83111
input_path,
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"R-ATH-73843" "5-Phosphoribose 1-diphosphate biosynthesis" "Arabidopsis thaliana"
2+
"R-ATH-1369062" "ABC transporters in lipid homeostasis" "Arabidopsis thaliana"
3+
"R-HSA-168276" " NS1 Mediated Effects on Host Pathways" "Homo sapiens"
4+
"R-HSA-164843" "2-LTR circle formation" "Homo sapiens"
5+
"R-HSA-73843" "5-Phosphoribose 1-diphosphate biosynthesis" "Homo sapiens"
6+
"R-HSA-1971475" "A tetrasaccharide linker sequence is required for GAG synthesis" "Homo sapiens"
7+
"R-HSA-5619084" "ABC transporter disorders" "Homo sapiens"
8+
"R-HSA-2644605" "FBXW7 Mutants and NOTCH1 in Cancer" "Homo sapiens"
9+
"R-XTR-379724" "tRNA Aminoacylation" "Xenopus tropicalis"
10+
"R-XTR-199992" "trans-Golgi Network Vesicle Budding" "Xenopus tropicalis"

0 commit comments

Comments
 (0)