#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# utilities to handle UMLS
#
# 2013-01-01 Created by Pascal Pfiffner
#

import csv
import sys
import os.path
import logging

from sqlite import SQLite


class UMLS (object):
    """ A class for importing UMLS terminologies into an SQLite database.
    """

    @classmethod
    def check_databases(cls):
        """ Check if our databases are in place and, if not, import them.
        Will raise on errors!

        UMLS (umls.db):
        If missing, prompts to use the `umls.sh` script.

        SNOMED (snomed.db):
        Reads SNOMED CT from tab-separated files and creates an SQLite database.

        RxNorm (rxnorm.db):
        If missing, prompts to use the `rxnorm.sh` script.
        """

        # UMLS
        umls_db = os.path.join('databases', 'umls.db')
        if not os.path.exists(umls_db):
            raise Exception("The UMLS database at %s does not exist. Run the import script `databases/umls.sh`." % umls_db)

        # SNOMED
        SNOMED.sqlite_handle = None
        try:
            SNOMED.setup_tables()
        except Exception as e:
            raise Exception("SNOMED setup failed: %s" % e)

        # RxNorm
        rxnorm_db = os.path.join('databases', 'rxnorm.db')
        if not os.path.exists(rxnorm_db):
            raise Exception("The RxNorm database at %s does not exist. Run the import script `databases/rxnorm.sh`." % rxnorm_db)
        else:
            # SNOMED table -> source CSV file
            rx_map = {
                'descriptions': 'snomed_desc.csv',
                'relationships': 'snomed_rel.csv'
            }

            # need to import?
            for table, filename in rx_map.iteritems():
                num_query = 'SELECT COUNT(*) FROM %s' % table
                num_existing = SNOMED.sqlite_handle.executeOne(num_query, ())[0]
                if num_existing > 0:
                    continue

                snomed_file = os.path.join('databases', filename)
                if not os.path.exists(snomed_file):
                    raise Exception("Need to import SNOMED, but the file %s is not present. Download SNOMED from http://www.nlm.nih.gov/research/umls/licensedcontent/snomedctfiles.html" % filename)

                SNOMED.import_csv_into_table(snomed_file, table)
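
# Typical call at application startup (a sketch; `check_databases` raises if
# databases/umls.db or databases/rxnorm.db are missing and imports the SNOMED
# tables if they are still empty):
#
#   UMLS.check_databases()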


class UMLSLookup (object):
    """ UMLS lookup """

    sqlite_handle = None
    did_check_dbs = False
    preferred_sources = ['"SNOMEDCT"', '"MTH"']

    def __init__(self):
        self.sqlite = SQLite.get('databases/umls.db')

    def lookup_code(self, cui, preferred=True):
        """ Return a list with triples that contain:
        - name
        - source
        - semantic type
        by looking the CUI up in our "descriptions" table.

        The "preferred" setting has the effect that only names from SNOMED CT
        (SNOMEDCT) and the Metathesaurus (MTH) will be reported. A lookup in
        our "descriptions" table is much faster than combing through the full
        MRCONSO table.
        """
        if cui is None or len(cui) < 1:
            return []

        # lazy UMLS db checking
        if not UMLSLookup.did_check_dbs:
            UMLSLookup.did_check_dbs = True
            try:
                UMLS.check_databases()
            except Exception as e:
                logging.error(e)
                # should this crash and burn?

        # take care of negations
        negated = '-' == cui[0]
        if negated:
            cui = cui[1:]

        parts = cui.split('@', 1)
        lookup_cui = parts[0]

        # STR: Name
        # SAB: Abbreviated Source Name
        # STY: Semantic Type
        if preferred:
            sql = 'SELECT STR, SAB, STY FROM descriptions WHERE CUI = ? AND SAB IN (%s)' % ", ".join(UMLSLookup.preferred_sources)
        else:
            sql = 'SELECT STR, SAB, STY FROM descriptions WHERE CUI = ?'

        # return as list
        arr = []
        for res in self.sqlite.execute(sql, (lookup_cui,)):
            if negated:
                arr.append(("[NEGATED] %s" % res[0], res[1], res[2]))
            else:
                arr.append(res)

        return arr

    def lookup_code_meaning(self, cui, preferred=True, no_html=True):
        """ Return a string (an empty string if the CUI is null or not found)
        by looking it up in our "descriptions" table.

        The "preferred" setting has the effect that only names from SNOMED CT
        (SNOMEDCT) and the Metathesaurus (MTH) will be reported. A lookup in
        our "descriptions" table is much faster than combing through the full
        MRCONSO table.
        """
        names = []
        for res in self.lookup_code(cui, preferred):
            if no_html:
                names.append("%s (%s) [%s]" % (res[0], res[1], res[2]))
            else:
                names.append("%s (<span style=\"color:#090;\">%s</span>: %s)" % (res[0], res[1], res[2]))

        comp = ", " if no_html else "<br/>\n"
        return comp.join(names) if len(names) > 0 else ''
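
# Example lookup (a sketch; assumes databases/umls.db has been imported, the
# CUI below is purely illustrative):
#
#   lookup = UMLSLookup()
#   for name, source, sem_type in lookup.lookup_code('C0027051'):
#       print('%s (%s) [%s]' % (name, source, sem_type))
#   print(lookup.lookup_code_meaning('C0027051'))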


class SNOMED (object):
    sqlite_handle = None

    # -------------------------------------------------------------------------- Database Setup
    @classmethod
    def import_csv_into_table(cls, snomed_file, table_name):
        """ Import SNOMED CSV into our SQLite database.
        The SNOMED CSV files can be parsed by Python's CSV parser with the
        "excel-tab" flavor.
        """
        logging.debug('..> Importing SNOMED %s into snomed.db...' % table_name)

        # not yet imported, parse tab-separated file and import
        with open(snomed_file, 'rb') as csv_handle:
            cls.sqlite_handle.isolation_level = 'EXCLUSIVE'
            sql = cls.insert_query_for(table_name)
            reader = unicode_csv_reader(csv_handle, dialect='excel-tab')
            i = 0
            try:
                for row in reader:
                    if i > 0:  # first row is the header row
                        # execute SQL (we just ignore duplicates)
                        params = cls.insert_tuple_from_csv_row_for(table_name, row)
                        try:
                            cls.sqlite_handle.execute(sql, params)
                        except Exception as e:
                            sys.exit(u'Cannot insert %s: %s' % (params, e))
                    i += 1

                # commit to file
                cls.sqlite_handle.commit()
                cls.did_import(table_name)
                cls.sqlite_handle.isolation_level = None
            except csv.Error as e:
                sys.exit('CSV error on line %d: %s' % (reader.line_num, e))

        logging.debug('..> %d concepts parsed' % (i - 1))

    @classmethod
    def setup_tables(cls):
        """ Creates the SQLite tables we need, not the tables we deserve.
        """
        if cls.sqlite_handle is None:
            cls.sqlite_handle = SQLite.get('databases/snomed.db')

        # descriptions
        cls.sqlite_handle.create('descriptions', '''(
            concept_id INTEGER PRIMARY KEY,
            lang TEXT,
            term TEXT,
            isa VARCHAR,
            active INT
            )''')
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")

        # relationships
        cls.sqlite_handle.create('relationships', '''(
            relationship_id INTEGER PRIMARY KEY,
            source_id INT,
            destination_id INT,
            rel_type INT,
            rel_text VARCHAR,
            active INT
            )''')
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)")

    @classmethod
    def insert_query_for(cls, table_name):
        """ Returns the insert query needed for the given table.
        """
        if 'descriptions' == table_name:
            return '''INSERT OR IGNORE INTO descriptions
                (concept_id, lang, term, isa, active)
                VALUES
                (?, ?, ?, ?, ?)'''
        if 'relationships' == table_name:
            return '''INSERT OR IGNORE INTO relationships
                (relationship_id, source_id, destination_id, rel_type, active)
                VALUES
                (?, ?, ?, ?, ?)'''
        return None

    @classmethod
    def insert_tuple_from_csv_row_for(cls, table_name, row):
        """ Maps a row of a SNOMED release file to the parameter tuple of the
        matching insert query.
        """
        if 'descriptions' == table_name:
            # row[4]: concept id, row[5]: language, row[6]: description type,
            # row[7]: term, row[2]: active flag
            isa = ''
            if len(row) > 6:
                if '900000000000013009' == row[6]:
                    isa = 'synonym'
                elif '900000000000003001' == row[6]:
                    isa = 'full'
            return (int(row[4]), row[5], row[7], isa, int(row[2]))
        if 'relationships' == table_name:
            # row[0]: relationship id, row[4]: source, row[5]: destination,
            # row[7]: relationship type, row[2]: active flag
            return (int(row[0]), int(row[4]), int(row[5]), int(row[7]), int(row[2]))
        return None

    @classmethod
    def did_import(cls, table_name):
        """ Allows us to set hooks after tables have been imported.
        """
        if 'relationships' == table_name:
            # resolve the two relationship types we care about into readable text
            cls.sqlite_handle.execute('''
                UPDATE relationships SET rel_text = 'isa' WHERE rel_type = 116680003
                ''')
            cls.sqlite_handle.execute('''
                UPDATE relationships SET rel_text = 'finding_site' WHERE rel_type = 363698007
                ''')
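
# Building snomed.db by hand follows the same path `UMLS.check_databases` takes
# (a sketch; the CSV file names are the ones that method expects under databases/):
#
#   SNOMED.setup_tables()
#   SNOMED.import_csv_into_table('databases/snomed_desc.csv', 'descriptions')
#   SNOMED.import_csv_into_table('databases/snomed_rel.csv', 'relationships')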


class SNOMEDLookup (object):
    """ SNOMED lookup """

    sqlite_handle = None

    def __init__(self):
        self.sqlite = SQLite.get('databases/snomed.db')

    def lookup_code_meaning(self, snomed_id, preferred=True, no_html=True):
        """ Returns all description terms for the given SNOMED id, as plain
        text or as HTML if "no_html" is False.
        The "preferred" flag currently has no function here.
        """
        if snomed_id is None or len(snomed_id) < 1:
            return ''

        sql = 'SELECT term, isa, active FROM descriptions WHERE concept_id = ?'
        names = []

        # loop over results
        for res in self.sqlite.execute(sql, (snomed_id,)):
            if not no_html and ('synonym' == res[1] or 0 == res[2]):
                names.append("<span style=\"color:#888;\">%s</span>" % res[0])
            else:
                names.append(res[0])

        if no_html:
            return ", ".join(names) if len(names) > 0 else ''
        return "<br/>\n".join(names) if len(names) > 0 else ''


class RxNormLookup (object):
    """ RxNorm lookup """

    sqlite_handle = None

    def __init__(self):
        self.sqlite = SQLite.get('databases/rxnorm.db')

    def lookup_code_meaning(self, rx_id, preferred=True, no_html=True):
        """ Return HTML for the meaning of the given code.
        If preferred is True (the default), only one match will be returned,
        looking for specific TTYs and using the "best" one.
        """
        if rx_id is None or len(rx_id) < 1:
            return ''

        # retrieve all matches
        sql = 'SELECT STR, TTY, RXAUI FROM RXNCONSO WHERE RXCUI = ? AND LAT = "ENG"'
        found = []
        names = []
        format_str = "<span title=\"RXAUI: %s\">%s <span style=\"color:#888;\">[%s]</span></span>"

        # loop over them
        for res in self.sqlite.execute(sql, (rx_id,)):
            found.append(res)

        if len(found) > 0:
            # preferred name only: walk the term types in order of preference
            # and stop at the first one that has a match
            if preferred:
                for tty in ['BN', 'IN', 'PIN', 'SBDC', 'SCDC', 'SBD', 'SCD', 'MIN']:
                    for res in found:
                        if tty == res[1]:
                            names.append(format_str % (res[2], res[0], res[1]))
                            break
                    else:
                        continue
                    break

                # no preferred TTY matched, fall back to the first hit
                if len(names) < 1:
                    res = found[0]
                    names.append(format_str % (res[2], res[0], res[1]))

            # return a list of all names
            else:
                for res in found:
                    names.append(format_str % (res[2], res[0], res[1]))

        return "<br/>\n".join(names) if len(names) > 0 else ''


# the standard Python CSV reader can't do unicode, here's the workaround
def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
    for row in csv_reader:
        yield [unicode(cell, 'utf-8') for cell in row]
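

# Running this module as a script doubles as a quick smoke test (a sketch, not
# part of the lookup classes; the codes are illustrative and the databases
# under `databases/` must already have been imported):
if '__main__' == __name__:
    logging.basicConfig(level=logging.DEBUG)
    UMLS.check_databases()

    look_umls = UMLSLookup()
    code_umls = 'C0027051'
    print('UMLS code "%s":   %s' % (code_umls, look_umls.lookup_code_meaning(code_umls)))

    look_snomed = SNOMEDLookup()
    code_snomed = '22298006'
    print('SNOMED code "%s": %s' % (code_snomed, look_snomed.lookup_code_meaning(code_snomed)))

    look_rxnorm = RxNormLookup()
    code_rxnorm = '161'
    print('RxNorm code "%s": %s' % (code_rxnorm, look_rxnorm.lookup_code_meaning(code_rxnorm, preferred=False)))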