-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconsult_umls.py
87 lines (60 loc) · 3.79 KB
/
consult_umls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pickle
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
#%% Label-creation function and function that tags the EHR text
# Load the pickled vocabulary table once at import time and renumber its rows
# from 0, discarding whatever index it was saved with.
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
with open('my_vocabulary.pkl', 'rb') as file:
    my_vocabulary = pickle.load(file).reset_index(drop=True)
# my_vocabulary = my_vocabulary['STR'].apply(lambda x: [item for item in x if item not in stopwords.words('spanish')])
# Entity label -> the semantic-type names grouped under it (the names look
# like UMLS semantic types). Built once instead of on every call.
_ENTITY_SEMANTIC_TYPES = {
    'Anatomia': (
        'anatomical structure', 'body system',
        'body part, organ, or organ component',
        'body location or region', 'body space or junction',
    ),
    'SignoSintoma': (
        'finding', 'injury or poisoning', 'physiologic function',
        'pathologic function', 'sign or symptom', 'organism function',
    ),
    'ProblemaClinico': (
        'congenital abnormality', 'disease or syndrome',
        'mental or behavioral dysfunction', 'anatomical abnormality',
    ),
    'Sustancia': (
        'body substance', 'chemical', 'pharmacologic substance',
        'biologically active substance', 'hazardous or poisonous substance',
        'substance', 'antibiotic', 'clinical drug',
    ),
    'Procedimiento': (
        'laboratory or test result', 'health care activity',
        'laboratory procedure', 'diagnostic procedure',
        'therapeutic or preventive procedure', 'research activity',
    ),
    'Atributo': (
        'temporal concept', 'qualitative concept', 'quantitative concept',
        'spatial concept',
    ),
}

# Inverted view used for the actual lookup: semantic-type name -> entity label.
# The groups above are disjoint, so no name maps to two labels.
_LABEL_BY_SEMANTIC_TYPE = {
    semantic_type: label
    for label, semantic_types in _ENTITY_SEMANTIC_TYPES.items()
    for semantic_type in semantic_types
}


def Etiqueta(i):
    """Return the entity label for row *i* of ``my_vocabulary``.

    The value in the last column of the row is looked up among the known
    semantic-type names and translated to one of ``'Anatomia'``,
    ``'SignoSintoma'``, ``'ProblemaClinico'``, ``'Sustancia'``,
    ``'Procedimiento'`` or ``'Atributo'``.

    Args:
        i: Positional row index into ``my_vocabulary``.

    Returns:
        The entity label, or ``'O'`` when the row's semantic type belongs to
        none of the groups (previously this case raised UnboundLocalError).
    """
    semantic_type = my_vocabulary.values[i][-1]
    return _LABEL_BY_SEMANTIC_TYPE.get(semantic_type, 'O')
def etiquetado(data):
    """Tokenize *data* and label its tokens against ``my_vocabulary``.

    Only purely alphabetic tokens are kept, lower-cased. Each token is then
    matched against the ``'STR'`` column of the vocabulary in three ways:

    * the token on its own;
    * the token plus the next one, when the next token is longer than two
      characters;
    * the token plus the next two, when the next token is two characters or
      shorter and a third token exists.

    Every token covered by a match receives the label ``Etiqueta`` returns
    for the matching vocabulary row; unmatched positions keep ``'O'``.

    Args:
        data: Text to tokenize and label.

    Returns:
        A 2-D NumPy array with one row per kept token and four columns:
        token, single-token label, two-token label, three-token label.
        For input with no alphabetic tokens the array has shape ``(0, 4)``.
    """
    tokens = [t.lower() for t in word_tokenize(data) if t.isalpha()]
    #filtered_tokens = [ t for t in tokens if t not in stopwords.words('spanish') ]
    n_tokens = len(tokens)

    # Labels are collected in plain lists and only turned into an array at the
    # end. Writing them into a string array sized by the tokens would silently
    # truncate any label longer than the longest token.
    unigram_labels = ['O'] * n_tokens
    bigram_labels = ['O'] * n_tokens
    trigram_labels = ['O'] * n_tokens

    # Index the vocabulary once: term -> row position. A later duplicate
    # overwrites an earlier one, so the last matching row wins, as it did with
    # a full scan. Non-string entries can never equal a token and are skipped.
    row_by_term = {
        term: row
        for row, term in enumerate(my_vocabulary['STR'].values)
        if isinstance(term, str)
    }

    for pos, word in enumerate(tokens):
        row = row_by_term.get(word)
        if row is not None:
            unigram_labels[pos] = Etiqueta(row)

        if pos + 1 >= n_tokens:
            continue  # last token: nothing to combine it with

        following = tokens[pos + 1]
        if len(following) > 2:
            row = row_by_term.get(word + ' ' + following)
            if row is not None:
                label = Etiqueta(i=row)
                bigram_labels[pos] = label
                bigram_labels[pos + 1] = label
        elif pos + 2 < n_tokens:
            # The next token is short, so try the three-token phrase. The
            # bounds check avoids indexing past the end when that short token
            # is the last one.
            phrase = word + ' ' + following + ' ' + tokens[pos + 2]
            row = row_by_term.get(phrase)
            if row is not None:
                label = Etiqueta(i=row)
                trigram_labels[pos] = label
                trigram_labels[pos + 1] = label
                trigram_labels[pos + 2] = label

    return np.transpose(
        np.vstack((tokens, unigram_labels, bigram_labels, trigram_labels))
    )