# parse_umls.py.bak -- forked from emreg00/toolbox (384 lines, 343 loc, 17.7 KB).
# Web-page navigation text and the line-number gutter captured with this
# listing were not part of the module and have been dropped.
import networkx, os, cPickle
def main():
    """Spot-check the MeSH id mappings built from a local UMLS 2013AA copy.

    Builds the three dictionaries returned by get_mesh_id_mapping from the
    MRCONSO / MRREL files under a hard-coded directory and prints a handful
    of lookups so the result can be inspected by eye.
    """
    base_dir = "/home/emre/arastirma/data/ontology/umls/2013AA/META/"
    desc_file = base_dir + "MRCONSO.RRF"
    rel_file = base_dir + "MRREL.RRF"
    # Other helpers that were exercised from here during development:
    # get_mesh_disease_ontology, get_basic_info and get_drug_info.
    mapping = get_mesh_id_mapping(desc_file, rel_file)
    mesh_id_to_name, concept_id_to_mesh_id, mesh_id_to_name_with_synonyms = mapping
    print("mesh dict: %s %s" % (len(mesh_id_to_name), len(mesh_id_to_name_with_synonyms)))
    print(mesh_id_to_name["D003924"])
    print(concept_id_to_mesh_id["C0011860"])
    print(mesh_id_to_name_with_synonyms["D003924"])
    # Report, per MeSH id, whether a main heading was found for it.
    for mesh_id in ("D003924", "D001769", "D005947", "D004493", "D006943"):
        print("%s %s" % (mesh_id, mesh_id in mesh_id_to_name))
class UMLS(object):
    """Parser for a UMLS Metathesaurus release stored as pipe-delimited files.

    The concept file (MRCONSO column layout) is parsed when the object is
    created; the relation file (MRREL column layout) is parsed lazily by
    get_relations() and get_ontology().

    Every parse result is cached on the instance. The filter arguments of
    the FIRST call to _get_concept_info / get_relations / get_ontology decide
    what is cached; later calls return that cache and ignore their own
    filter arguments.

    Attributes:
        concept_id_to_values: {CUI: {source: set((name, source code, term type))}}
        concept_to_concept_id: {name: CUI}; the last CUI seen for a name wins.
        concept_id_to_relations: {CUI2: {CUI1: [(relation label, source), ...]}}
            or None until get_relations() has run.
        ontology: networkx.DiGraph or None until get_ontology() has run.
    """

    # Column names of the concept file. The extra "dummy" entry presumably
    # absorbs the empty field after each line's trailing delimiter -- TODO
    # confirm against the release files.
    # Columns used here: CUI, LAT (language), ISPREF, SAB (source),
    # TTY (term type, e.g. PT preferred / SY synonym), CODE, STR (name).
    _DESC_COLUMNS = ["CUI", "LAT", "TS", "LUI", "STT", "SUI", "ISPREF", "AUI",
                     "SAUI", "SCUI", "SDUI", "SAB", "TTY", "CODE", "STR", "SRL",
                     "SUPPRESS", "CVF", "dummy"]
    # Column names of the relation file.
    _REL_COLUMNS = ["CUI1", "AUI1", "STYPE1", "REL", "CUI2", "AUI2", "STYPE2",
                    "RELA", "RUI", "SRUI", "SAB", "SL", "RG", "DIR", "SUPPRESS",
                    "CVF", "dummy"]

    def __init__(self, file_name_desc, file_name_rel, concept_types=None, concept_sources=None, only_preferred=False):
        """Remember both file names and parse the concept file immediately.

        Args:
            file_name_desc: path of the concept (MRCONSO) file.
            file_name_rel: path of the relation (MRREL) file.
            concept_types: optional collection of TTY values to keep.
            concept_sources: optional collection of SAB values to keep.
            only_preferred: keep only rows whose ISPREF column is "Y".
        """
        self.file_name_desc = file_name_desc
        self.file_name_rel = file_name_rel
        self.delim = "|"
        self.ontology = None
        self.concept_id_to_values = None
        self.concept_to_concept_id = None
        self.concept_id_to_relations = None
        self._get_concept_info(concept_types, concept_sources, only_preferred)

    @staticmethod
    def _column_index(header_names):
        """Map lower-cased column names to their position in a row."""
        return dict((name.lower(), i) for i, name in enumerate(header_names))

    def _get_concept_info(self, concept_types=None, concept_sources=None, only_preferred=False):
        """Parse the concept file once and cache the two lookup dictionaries.

        Only rows whose LAT column is "ENG" are kept; the optional arguments
        narrow the selection further. One line is printed whenever a name
        that is already mapped gets re-mapped to a different CUI.

        Returns:
            (concept_id_to_values, concept_to_concept_id)
        """
        if self.concept_id_to_values is None and self.concept_to_concept_id is None:
            col = self._column_index(self._DESC_COLUMNS)
            id_to_values = {}
            name_to_id = {}
            # The context manager closes the file even if a row is malformed.
            with open(self.file_name_desc) as f:
                for line in f:
                    words = line.strip("\n").split(self.delim)
                    if words[col["lat"]] != "ENG":
                        continue
                    if only_preferred and words[col["ispref"]] != "Y":
                        continue
                    concept_type = words[col["tty"]]
                    if concept_types is not None and concept_type not in concept_types:
                        continue
                    source = words[col["sab"]]
                    if concept_sources is not None and source not in concept_sources:
                        continue
                    concept_id = words[col["cui"]]
                    concept = words[col["str"]]
                    source_id = words[col["code"]]
                    by_source = id_to_values.setdefault(concept_id, {})
                    by_source.setdefault(source, set()).add((concept, source_id, concept_type))
                    # The map is keyed by NAME, so the conflict test must look
                    # the name up (the CUI is never a key of this dictionary).
                    previous_id = name_to_id.get(concept)
                    if previous_id is not None and previous_id != concept_id:
                        print("Concept id conflict - overwriting: %s %s %s" % (concept, previous_id, concept_id))
                    name_to_id[concept] = concept_id
            # Publish only after a complete parse so that a failure cannot
            # leave a half-filled cache behind.
            self.concept_id_to_values = id_to_values
            self.concept_to_concept_id = name_to_id
        return self.concept_id_to_values, self.concept_to_concept_id

    def get_concept_id(self, concept):
        """Return the CUI recorded for a concept name; KeyError if unknown."""
        return self.concept_to_concept_id[concept]

    def get_values_by_concept_id(self, concept_id):
        """Return {source: set((name, source code, term type))} for a CUI.

        Raises KeyError when no row was kept for the CUI.
        """
        return self.concept_id_to_values[concept_id]

    def get_concepts(self, concept_id, concept_sources=None, concept_types=None):
        """List (source, name, term type) entries of a CUI.

        Args:
            concept_id: the CUI to look up (KeyError if unknown).
            concept_sources: optional collection of sources to keep.
            concept_types: optional collection of term types to keep.

        Returns:
            A list of (source, name, term type) tuples; empty when every
            entry is filtered out.
        """
        concepts = []
        for source, vals in self.get_values_by_concept_id(concept_id).items():
            if concept_sources is not None and source not in concept_sources:
                continue
            for concept, _source_id, concept_type in vals:
                if concept_types is not None and concept_type not in concept_types:
                    continue
                concepts.append((source, concept, concept_type))
        return concepts

    def _iter_relation_rows(self, relation_types, relation_a_types, source_types):
        """Yield (cui1, cui2, rel, rela, sab) for relation rows passing the filters.

        A filter that is None accepts every value of its column.
        """
        col = self._column_index(self._REL_COLUMNS)
        with open(self.file_name_rel) as f:
            for line in f:
                words = line.strip("\n").split(self.delim)
                relation = words[col["rel"]]
                if relation_types is not None and relation not in relation_types:
                    continue
                relation_a = words[col["rela"]]
                if relation_a_types is not None and relation_a not in relation_a_types:
                    continue
                source = words[col["sab"]]
                if source_types is not None and source not in source_types:
                    continue
                yield words[col["cui1"]], words[col["cui2"]], relation, relation_a, source

    def get_relations(self, relation_types=None, relation_a_types=None, source_types=None):
        """Parse the relation file once and cache relations keyed by CUI2.

        Args:
            relation_types: optional collection of REL values to keep.
            relation_a_types: optional collection of RELA values to keep.
            source_types: optional collection of SAB values to keep.

        Returns:
            {CUI2: {CUI1: [(label, source), ...]}}. Each kept row contributes
            its REL value and, when non-empty, its RELA value as a second
            entry. The arguments are ignored once the cache exists.
        """
        if self.concept_id_to_relations is None:
            relations = {}
            for cui1, cui2, relation, relation_a, source in self._iter_relation_rows(relation_types, relation_a_types, source_types):
                labels = relations.setdefault(cui2, {}).setdefault(cui1, [])
                labels.append((relation, source))
                if relation_a != "":
                    labels.append((relation_a, source))
            self.concept_id_to_relations = relations
        return self.concept_id_to_relations

    def get_ontology(self, root_concept=None, relation_types=None, relation_a_types=None, source_types=None):
        """Build (once) the directed relation graph and optionally cut a subtree.

        Args:
            root_concept: optional concept NAME; when given, only the part of
                the graph reachable from its CUI is returned (KeyError if the
                name is unknown).
            relation_types, relation_a_types, source_types: row filters as in
                get_relations(); ignored once the graph is cached.

        Returns:
            The cached networkx.DiGraph, or the subgraph produced by
            get_tree_rooted_at() when root_concept is given.
        """
        if self.ontology is None:
            graph = networkx.DiGraph()
            for cui1, cui2, _relation, _relation_a, _source in self._iter_relation_rows(relation_types, relation_a_types, source_types):
                graph.add_edge(cui2, cui1)
            # Edges are inserted CUI2 -> CUI1 and the graph is then reversed,
            # so the cached graph holds CUI1 -> CUI2 edges.
            self.ontology = graph.reverse()
        if root_concept is None:
            return self.ontology
        return get_tree_rooted_at(self.ontology, self.get_concept_id(root_concept))

    def _concept_names(self, concept_id, concept_types):
        """Return the names of a CUI for the given term types ([] if unknown)."""
        try:
            entries = self.get_concepts(concept_id, concept_types=concept_types)
        except KeyError:
            return []
        return [concept for _source, concept, _concept_type in entries]

    def get_drug_disease_relations(self):
        """Map drug names to the names of the concepts they are related to.

        Drugs are the nodes that take part in an edge of the "isa" hierarchy
        (source SNOMEDCT) under "Pharmaceutical / biologic product". For each
        of them the cached relations are consulted (requested with RELA
        "treats" / "may_treat"; see the class note on caching) and every name
        of the drug is linked to every name of each related concept.

        Returns:
            {drug name: set(related concept names)}
        """
        concept_types = set(["MH", "PF", "PT", "PN", "EN", "EP", "FN", "SY", "PM"])
        ontology = self.get_ontology(root_concept="Pharmaceutical / biologic product",
                                     relation_a_types=set(["isa"]),
                                     source_types=set(["SNOMEDCT"]))
        # Fetched once: the call does not depend on the node, and file errors
        # now surface instead of being swallowed per node.
        relations = self.get_relations(relation_a_types=set(["treats", "may_treat"]), source_types=None)
        # A node occurs in many edges; handle each one a single time.
        drug_ids = set()
        for edge in ontology.edges():
            drug_ids.update(edge)
        drug_to_diseases = {}
        for drug_id in drug_ids:
            related = relations.get(drug_id)
            if related is None:
                continue
            drug_names = self._concept_names(drug_id, concept_types)
            if not drug_names:
                continue
            for disease_id in related:
                disease_names = self._concept_names(disease_id, concept_types)
                if not disease_names:
                    continue
                for drug_name in drug_names:
                    drug_to_diseases.setdefault(drug_name, set()).update(disease_names)
        return drug_to_diseases
def get_tree_rooted_at(g, root):
    """Return the subgraph of g induced by root and all nodes reachable from it.

    Args:
        g: graph object offering neighbors(node) and subgraph(nodes)
            (a networkx graph in this module).
        root: node to start from.

    Returns:
        g.subgraph(...) over root plus every node reachable by repeatedly
        following g.neighbors(); just root when it has no neighbors.
    """
    reachable = set([root])
    pending = [root]
    # Each node is expanded exactly once, so cycles cannot cause repeated
    # work, and g.neighbors() may return a one-shot iterator because its
    # result is consumed only once per node.
    while pending:
        node = pending.pop()
        for neighbor in g.neighbors(node):
            if neighbor not in reachable:
                reachable.add(neighbor)
                pending.append(neighbor)
    return g.subgraph(reachable)
def get_mesh_id_mapping(desc_file, rel_file, only_diseases = True, dump_file = None):
    """Build the MeSH id <-> name / CUI mappings, optionally cached on disk.

    Args:
        desc_file: path of the concept (MRCONSO) file.
        rel_file: path of the relation (MRREL) file.
        only_diseases: restrict the mapping to the CUIs that are nodes of the
            graph returned by get_mesh_disease_ontology(). This excludes
            synonyms attached to other CUIs.
        dump_file: optional pickle cache. When the file exists it is loaded
            and returned as is; otherwise the result is written to it. With
            only_diseases, dump_file + ".ontology" is handed on as the cache
            of the ontology; no such path is derived when dump_file is None.

    Returns:
        (source_id_to_concept, concept_id_to_mesh_id, source_id_to_concepts):
        MeSH id -> main heading name (term type "MH"), CUI -> MeSH id, and
        MeSH id -> set of all names including synonyms.
    """
    if dump_file is not None and os.path.exists(dump_file):
        # NOTE(security): unpickling can execute arbitrary code; dump_file
        # must only ever point at a cache this module wrote itself.
        with open(dump_file, "rb") as f:
            source_id_to_concept, concept_id_to_mesh_id, source_id_to_concepts = cPickle.load(f)
        return source_id_to_concept, concept_id_to_mesh_id, source_id_to_concepts
    umls = UMLS(desc_file, rel_file)
    concept_ids_disease = None
    if only_diseases:
        # Concatenating onto None would raise TypeError, so derive the
        # ontology cache name only when a cache was requested.
        ontology_dump = None if dump_file is None else dump_file + ".ontology"
        g = get_mesh_disease_ontology(desc_file, rel_file, umls=umls, dump_file=ontology_dump)
        concept_ids_disease = set(g.nodes())
    source_id_to_concept = {} # only main headers
    source_id_to_concepts = {} # all concepts including synonyms
    concept_id_to_mesh_id = {}
    for concept_id, values in umls.concept_id_to_values.items():
        if concept_ids_disease is not None and concept_id not in concept_ids_disease:
            continue
        # CUIs without any MeSH entry are skipped instead of raising KeyError.
        for concept, source_id, concept_type in values.get("MSH", ()):
            if concept_type == "MH": # main heading
                source_id_to_concept[source_id] = concept
            source_id_to_concepts.setdefault(source_id, set()).add(concept)
            # If a CUI carries several MeSH ids, the one seen last is kept.
            concept_id_to_mesh_id[concept_id] = source_id
    if dump_file is not None:
        values = (source_id_to_concept, concept_id_to_mesh_id, source_id_to_concepts)
        with open(dump_file, "wb") as f:
            cPickle.dump(values, f)
    return source_id_to_concept, concept_id_to_mesh_id, source_id_to_concepts
def get_mesh_disease_ontology(desc_file, rel_file, umls = None, dump_file = None):
    """Return the MeSH disease hierarchy as a graph, optionally cached on disk.

    Args:
        desc_file: path of the concept (MRCONSO) file; used only when umls
            is None.
        rel_file: path of the relation (MRREL) file; used only when umls
            is None.
        umls: optional, already constructed UMLS object to reuse.
        dump_file: optional pickle cache. When the file exists it is loaded
            and returned; otherwise the graph is built and written to it.
            With None nothing is read or written.

    Returns:
        The graph from UMLS.get_ontology() rooted at the concept named
        "Diseases (MeSH Category)", built from "CHD" relations of source
        "MSH".
    """
    if dump_file is not None and os.path.exists(dump_file):
        # NOTE(security): unpickling can execute arbitrary code; dump_file
        # must only ever point at a cache this module wrote itself.
        with open(dump_file, "rb") as f:
            return cPickle.load(f)
    if umls is None:
        umls = UMLS(desc_file, rel_file)
    root = "Diseases (MeSH Category)" #! Consider adding Mental disorders as well
    sources = set(["MSH"])
    relations = set(["CHD"])
    g = umls.get_ontology(root_concept = root, relation_types = relations, source_types = sources)
    # Only write a cache when one was asked for; open(None) raises TypeError.
    if dump_file is not None:
        with open(dump_file, "wb") as f:
            cPickle.dump(g, f)
    return g
def get_mesh_id_to_disease_category(desc_file, rel_file, dump_file = None):
    """Map every concept id to the top-level disease branches that contain it.

    The top-level branches are the neighbors of the hard-coded root CUI in
    the graph returned by get_mesh_disease_ontology(); a concept id is listed
    under each branch whose subtree (get_tree_rooted_at) includes it.

    Returns:
        {concept id: [top-level concept id, ...]}
    """
    ontology = get_mesh_disease_ontology(desc_file, rel_file, dump_file = dump_file)
    root = "C0012674" # "Diseases (MeSH Category)"
    concept_id_to_top_ids = {}
    for top_id in ontology.neighbors(root):
        branch = get_tree_rooted_at(ontology, top_id)
        for member in branch.nodes():
            if member not in concept_id_to_top_ids:
                concept_id_to_top_ids[member] = []
            concept_id_to_top_ids[member].append(top_id)
    return concept_id_to_top_ids
def get_snomedct_drug_ontology(desc_file, rel_file, umls = None):
    """Return the SNOMEDCT "isa" hierarchy below the drug root concept.

    Args:
        desc_file: path of the concept (MRCONSO) file; used only when umls
            is None.
        rel_file: path of the relation (MRREL) file; used only when umls
            is None.
        umls: optional, already constructed UMLS object to reuse.

    Returns:
        The graph from UMLS.get_ontology() rooted at the concept named
        "Pharmaceutical / biologic product".
    """
    if umls is None:
        umls = UMLS(desc_file, rel_file)
    root = "Pharmaceutical / biologic product"
    sources = set(["SNOMEDCT"])
    # "isa" is a value of the RELA column, so it has to be passed as
    # relation_a_types -- the same way UMLS.get_drug_disease_relations()
    # requests this hierarchy. Filtering the REL column by it keeps no rows.
    relations_a = set(["isa"])
    g = umls.get_ontology(root_concept = root, relation_a_types = relations_a, source_types = sources)
    return g
def get_basic_info(desc_file, rel_file):
    """Print exploratory facts about "Diabetes Mellitus" in the MeSH hierarchy.

    Prints the concept id of the name, its MSH entries, the size of the
    disease graph, the edges leaving the concept with the entries of each
    edge target, and finally the "CHD" relations cached for a fixed CUI.
    """
    msh_only = set(["MSH"]) # alternative tried: set(["SNOMEDCT"])
    child_of = set(["CHD"]) # alternative tried: set(["isa"])
    umls = UMLS(desc_file, rel_file)

    concept = "Diabetes Mellitus" # alternative tried: "Triazole antifungals"
    concept_id = umls.get_concept_id(concept)
    print("%s %s" % (concept, concept_id))
    print(umls.get_concepts(concept_id, concept_sources = msh_only))

    # alternative root tried: "Pharmaceutical / biologic product"
    graph = umls.get_ontology(root_concept = "Diseases (MeSH Category)",
                              relation_types = child_of, source_types = msh_only)
    print("%s %s" % (len(graph.nodes()), len(graph.edges())))
    print("%s %s" % (concept_id, graph.edges([concept_id])))
    for start, end in graph.edges([concept_id]):
        print("%s %s %s" % (start, end, umls.get_concepts(end, concept_sources = msh_only)))

    concept_id = "C0011849" # alternative tried: "C0360363" (Azole antifungal)
    related = umls.get_relations(relation_types = child_of, source_types = msh_only)[concept_id]
    for other_id in related:
        print("%s %s" % (other_id, related[other_id]))
def get_drug_info(desc_file, rel_file):
    """Print every drug name with the set of concept names it is related to.

    The pairs come from UMLS.get_drug_disease_relations() on a UMLS object
    built from the two given files.
    """
    relations = UMLS(desc_file, rel_file).get_drug_disease_relations()
    for drug in relations:
        print("%s %s" % (drug, relations[drug]))
def _phenotype_keywords(phenotype):
    """Turn a phenotype name into the phrases a description must contain.

    The name is split on "," and " and "; within each phrase a trailing
    "'s" is dropped from every word, one trailing "s" is dropped from the
    last word, and the generic words "disease", "disorder" and "syndrome"
    are removed. A phrase left without words becomes "", which is contained
    in every description.
    """
    keywords = []
    for phrase in phenotype.replace(" and ", ", ").split(","):
        words = phrase.strip().split()
        last = len(words) - 1
        kept = []
        for i, token in enumerate(words):
            if token.endswith("'s"):
                token = token[:-2]
            # endswith() is safe on the empty token left by a bare "'s".
            if i == last and token.endswith("s"):
                token = token[:-1]
            if token in ("disease", "disorder", "syndrome"):
                continue
            kept.append(token)
        keywords.append(" ".join(kept).strip())
    return keywords


def get_disease_specific_drugs(umls, selected_drugs, name_to_drug, synonym_to_drug, phenotypes):
    """Assign drugs to phenotypes by keyword-matching UMLS disease names.

    Args:
        umls: object providing get_drug_disease_relations(), which returns
            {drug name: set(disease descriptions)}.
        selected_drugs: collection of drug ids that may appear in the result.
        name_to_drug: {lower-case drug name: drug id}.
        synonym_to_drug: {lower-case synonym: drug id}; consulted when the
            name is not in name_to_drug.
        phenotypes: iterable of phenotype names. They are compared against
            lower-cased descriptions without being lower-cased themselves.

    Returns:
        {phenotype: set(drug ids)}. A drug is assigned to a phenotype when
        the first word of its UMLS name resolves to a selected drug id and
        one of its descriptions contains every keyword of the phenotype.
    """
    drug_to_diseases = umls.get_drug_disease_relations()
    # The keywords depend only on the phenotype: compute them once rather
    # than for every (drug, description) pair.
    phenotype_keywords = [(phenotype, _phenotype_keywords(phenotype)) for phenotype in phenotypes]
    disease_to_drugs = {}
    for drug, diseases in drug_to_diseases.items():
        name_parts = drug.split()
        if not name_parts:
            # Empty or blank drug name: nothing to look up.
            continue
        drug = name_parts[0].lower()
        if drug in name_to_drug:
            drugbank_id = name_to_drug[drug]
        elif drug in synonym_to_drug:
            drugbank_id = synonym_to_drug[drug]
        else:
            continue
        if drugbank_id not in selected_drugs:
            continue
        for description in diseases:
            description = description.lower()
            for phenotype, keywords in phenotype_keywords:
                if all(keyword in description for keyword in keywords):
                    disease_to_drugs.setdefault(phenotype, set()).add(drugbank_id)
    return disease_to_drugs
def old_get_disease_specific_drugs(umls, drug_to_name, phenotypes):
    """Older, regular-expression based variant of get_disease_specific_drugs.

    Args:
        umls: object providing get_drug_disease_relations(), which returns
            {drug name: set(disease descriptions)}.
        drug_to_name: {drug id: drug name}. Each lower-cased name is used as
            a regular expression after "[", "]", "{" and "}" are replaced by
            "_". Names that cannot be compiled are printed and skipped;
            empty names are skipped because an empty pattern matches every
            drug.
        phenotypes: iterable of phenotype names; each lower-cased name is
            used as a regular expression on lower-cased descriptions.

    Returns:
        {phenotype: set(drug ids)}. For every UMLS drug whose name is matched
        by at least one drug pattern, the matching drug id with the longest
        name (the first one on ties) is added to each phenotype that matches
        one of the drug's descriptions.
    """
    import re
    drug_to_diseases = umls.get_drug_disease_relations()
    disease_exps = [ (disease, re.compile(disease.lower())) for disease in phenotypes ]
    drug_id_to_exp = {}
    for drug_id, keyword in drug_to_name.items():
        try:
            for bracket in "[{}]":
                keyword = keyword.replace(bracket, "_")
            exp = re.compile(keyword.lower())
        except Exception:
            # Best effort: report names that are not usable as a pattern
            # (invalid regex, non-string value) and carry on, without
            # trapping KeyboardInterrupt / SystemExit as a bare except did.
            print(keyword)
            continue
        if not keyword:
            continue
        drug_id_to_exp[drug_id] = exp
    disease_to_drugs = {}
    for drug, diseases in drug_to_diseases.items():
        drug_lower = drug.lower()
        # Iterate drug_to_name (not the pattern dict) so that ties between
        # equally long names are resolved in that dictionary's order.
        drugbank_ids = []
        for drug_id in drug_to_name:
            exp_drug = drug_id_to_exp.get(drug_id)
            if exp_drug is not None and exp_drug.search(drug_lower) is not None:
                drugbank_ids.append(drug_id)
        if not drugbank_ids:
            continue
        # Prefer the candidate with the longest name; max() keeps the first
        # candidate among equally long ones.
        selected_drugbank_id = max(drugbank_ids, key=lambda candidate: len(drug_to_name[candidate]))
        descriptions = [ description.lower() for description in diseases ]
        for disease, exp in disease_exps:
            if any(exp.search(description) is not None for description in descriptions):
                disease_to_drugs.setdefault(disease, set()).add(selected_drugbank_id)
    return disease_to_drugs
# Run the spot checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()