# update_and_write.py
import os.path
import pickle
from collections import OrderedDict

import pandas as pd
import tensorflow as tf
from tensorflow import keras
from nltk.tokenize import word_tokenize
from progressbar import Bar, ETA, Percentage, ProgressBar
from sklearn.model_selection import train_test_split

from Clean_function import clean_note

# Maximum note length (in tokens) kept when writing records; longer notes are truncated.
maxlen = 2500
# A new word must appear at least this often in a batch before it enters the text vocabulary.
min_word_frequency = 5

# Tokenizer "updaters" for the text and the labels, plus TFRecord writing.
# Inputs and labels are written as integer sequences, which has clear advantages over
# writing them multi-hot encoded (smaller files, and the token order is preserved).
def write_tf_records(writer, text_list, label_array, maxlen):
    '''
    Args:
        1. writer: a TFRecord writer object used to write the records to disk
        2. text_list: a list of integer-coded medical notes
        3. label_array: a list of integer-coded ICD-9 code lists
        4. maxlen: the maximum note length we want to deal with; longer notes are truncated
    '''
    pbar = ProgressBar(widgets=[Bar('=', '[', ']'), ' ', Percentage(), ' ', ETA()],
                       maxval=len(text_list)).start()
    for j in range(len(text_list)):
        # Keep only the last maxlen tokens of the note (same convention as update_write).
        review = text_list[j][-maxlen:]
        target = label_array[j]
        ex = tf.train.SequenceExample()
        # One feature list per example for the label indexes ...
        label_indexes = ex.feature_lists.feature_list['label_indexes']
        for label_index in target:
            label_indexes.feature.add().int64_list.value.append(label_index)
        # ... and one for the token indexes of the note itself.
        token_indexes = ex.feature_lists.feature_list['token_indexes']
        for token_index in review:
            token_indexes.feature.add().int64_list.value.append(token_index)
        writer.write(ex.SerializeToString())
        pbar.update(j)
    pbar.finish()
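# Illustrative sketch (not part of the original pipeline): records written by
# write_tf_records() can be read back by parsing the same SequenceExample schema.
# It assumes the TF 1.x API that the rest of this file uses.
def _example_parse_record(serialized_example):
    sequence_features = {
        'token_indexes': tf.FixedLenSequenceFeature([], dtype=tf.int64),
        'label_indexes': tf.FixedLenSequenceFeature([], dtype=tf.int64),
    }
    _, sequences = tf.parse_single_sequence_example(serialized_example,
                                                    sequence_features=sequence_features)
    return sequences['token_indexes'], sequences['label_indexes']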
def initial_tokenizer_text(ICD_file_en='/data/ICD9/icd9.txt',
                           tokenizer_name='/tokenizer/Text_Tokenizer.pkl',
                           min_word_frequency=5):
    # Set up an initial tokenizer on the English ICD-9 text descriptions.
    # (min_word_frequency is accepted for a uniform signature but not applied here:
    # every word of the descriptions is kept.)
    all_names = pd.read_table(ICD_file_en, encoding="ISO-8859-1")
    ICD9_des_en = all_names['LONG DESCRIPTION'].tolist() + all_names['SHORT DESCRIPTION'].tolist()
    ICD9_des_en = [clean_note(tx) for tx in ICD9_des_en]
    init_text = ICD9_des_en
    print('Initial Set Up Tokenizer')
    extra_words = [word_tokenize(review) for review in init_text]
    # Flatten the nested list and drop duplicates.
    extra_words = [word for review in extra_words for word in review]
    vocabulary = list(set(extra_words))
    # i + 1 because index 0 will represent the padding token.
    word2idx = OrderedDict((x, i + 1) for i, x in enumerate(vocabulary))
    t = keras.preprocessing.text.Tokenizer(oov_token='UNK__TO_123')
    t.word_index = word2idx
    # Out-of-vocabulary words map to a dedicated index above the vocabulary.
    t.word_index['UNK__TO_123'] = len(vocabulary) + 2
    # Write the tokenizer (word -> index mapping) to disk.
    pickle.dump(t, open(tokenizer_name, 'wb'))
def initial_tokenizer_label(tokenizer_name='/tokenizer/Label_Tokenizer.pkl'):
    # We use the keras Tokenizer: it is basically a wrapper around a word->index
    # dictionary with some convenient added functionality.
    t = keras.preprocessing.text.Tokenizer()
    # Start with an empty index; index 0 stays reserved and real labels start from 1
    # once Update_Tokenizer_Labels fills the index.
    t.word_index = {}
    # Finally we save the (still empty) tokenizer.
    pickle.dump(t, open(tokenizer_name, 'wb'))
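# Illustrative only: a one-shot bootstrap sketch showing how the two initializers
# above would typically be run once, before the first update_write() call. The
# paths simply mirror the defaults above and are assumptions, not fixed locations.
def _example_bootstrap_tokenizers():
    initial_tokenizer_text(ICD_file_en='/data/ICD9/icd9.txt',
                           tokenizer_name='/tokenizer/Text_Tokenizer.pkl',
                           min_word_frequency=min_word_frequency)
    initial_tokenizer_label(tokenizer_name='/tokenizer/Label_Tokenizer.pkl')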
def Update_Tokenizer_Text(initial_tokenizer, new_text, min_word_frequency=5):
    vocabulary_old = list(initial_tokenizer.word_index.keys())
    # Tokenize the new texts and look for the most common words.
    reviews = [word_tokenize(review) for review in new_text]
    # Flatten the nested list.
    reviews = [word for review in reviews for word in review]
    # Compute the frequency of each word.
    word_frequency = pd.Series(reviews).value_counts()
    # Keep only words that reach the minimum frequency.
    potential_vocabulary = word_frequency[word_frequency >= min_word_frequency].index.tolist()
    # Add words that appear at least min_word_frequency times in the new documents
    # and that we do not already have.
    old_words = set(vocabulary_old)
    vocab_add = [item for item in potential_vocabulary if item not in old_words]
    print('Words added:')
    print(len(vocab_add))
    # Only extend the index if there is something to add.
    if len(vocab_add) > 0:
        # Append the new words while preserving the order (and hence the indices)
        # of the existing vocabulary.
        vocab_new = vocabulary_old + vocab_add
        word2idx_new = OrderedDict((x, i + 1) for i, x in enumerate(vocab_new))
        initial_tokenizer.word_index = word2idx_new
    return initial_tokenizer
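# Illustrative only (toy, made-up data): existing word indices are preserved and
# new words that are frequent enough are appended at the end of the index.
# Like the rest of this module, it needs the NLTK 'punkt' tokenizer data.
def _example_update_text_tokenizer():
    tok = keras.preprocessing.text.Tokenizer(oov_token='UNK__TO_123')
    tok.word_index = OrderedDict([('pain', 1), ('fever', 2)])
    new_notes = ['chest pain and chest discomfort', 'chest discomfort persists']
    tok = Update_Tokenizer_Text(tok, new_text=new_notes, min_word_frequency=2)
    # 'pain' keeps index 1 and 'fever' keeps index 2; 'chest' and 'discomfort'
    # (each seen at least twice) are appended with indices 3 and 4.
    print(tok.word_index)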
def Update_Tokenizer_Labels(initial_tokenizer, labels):
    if len(initial_tokenizer.word_index) > 0:
        vocabulary_old = list(initial_tokenizer.word_index.keys())
        potential_vocabulary = [word for review in labels for word in review]
        # Remove duplicates.
        potential_vocabulary = list(set(potential_vocabulary))
        # Only add labels we have not seen yet (and keep only string-valued codes).
        old_labels = set(vocabulary_old)
        vocab_add = [item for item in potential_vocabulary if item not in old_labels]
        vocab_add = [word for word in vocab_add if type(word) == str]
        print('Labels added:')
        print(len(vocab_add))
        # Only extend the index if there is something to add.
        if len(vocab_add) > 0:
            # Append the new labels after the old ones so the existing indices stay stable.
            vocab_new = vocabulary_old + vocab_add
            word2idx_new = OrderedDict((x, i + 1) for i, x in enumerate(vocab_new))
            # Overwrite the word index with the extended one.
            initial_tokenizer.word_index = word2idx_new
    else:
        # Empty dictionary: this looks like the freshly initialised label tokenizer,
        # so every string-valued label becomes part of the initial index.
        print('Empty dictionary: Initial Tokenizer ? ')
        potential_vocabulary = [word for review in labels for word in review]
        # Remove duplicates.
        potential_vocabulary = list(set(potential_vocabulary))
        vocab_add = [word for word in potential_vocabulary if type(word) == str]
        vocab_new = vocab_add
        print('Labels added:')
        print(len(vocab_add))
        # Indices start from 1; 0 stays reserved for padding.
        word2idx_new = OrderedDict((x, i + 1) for i, x in enumerate(vocab_new))
        initial_tokenizer.word_index = word2idx_new
    # Return the tokenizer with the new index.
    return initial_tokenizer
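# Illustrative only (toy, made-up ICD-9 codes): previously assigned label indices
# stay stable and unseen codes are appended at the end of the index.
def _example_update_label_tokenizer():
    tok = keras.preprocessing.text.Tokenizer()
    tok.word_index = OrderedDict([('4019', 1), ('25000', 2)])
    tok = Update_Tokenizer_Labels(tok, labels=[['4019', 'V5861'], ['25000']])
    # '4019' and '25000' keep indices 1 and 2; 'V5861' is appended with index 3.
    print(tok.word_index)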
def update_write(tokenizer_text_path, tokenizer_label_path, In_Text, labels, output_name):
    # First check whether the TFRecord files already exist; if so, stop and ask for a new name.
    if os.path.isfile(output_name + '_train.tfrecords'):
        print("File already exists, please choose a different name.")
        return
    print('Cleaning Notes')
    In_Text = [clean_note(tx) for tx in In_Text]
    # Update the text tokenizer with the words of the new notes and save it back to disk.
    print('Update Text Tokenizer')
    text_tokenizer = pickle.load(open(tokenizer_text_path, 'rb'))
    new_tokenizer = Update_Tokenizer_Text(initial_tokenizer=text_tokenizer,
                                          min_word_frequency=min_word_frequency,
                                          new_text=In_Text)
    pickle.dump(new_tokenizer, open(tokenizer_text_path, 'wb'))
    # Update the label tokenizer; it might still be empty (freshly initialised),
    # which Update_Tokenizer_Labels handles.
    print('Update Label Tokenizer')
    label_tokenizer = pickle.load(open(tokenizer_label_path, 'rb'))
    new_tokenizer_label = Update_Tokenizer_Labels(label_tokenizer, labels)
    pickle.dump(new_tokenizer_label, open(tokenizer_label_path, 'wb'))
    # Encode notes and labels as integer sequences.
    print('Encode the Data')
    In_Text = new_tokenizer.texts_to_sequences(In_Text)
    labels = new_tokenizer_label.texts_to_sequences(labels)
    # Shorten the notes to at most maxlen tokens (done on the integer sequences, not on the raw strings).
    In_Text = [x[-maxlen:] for x in In_Text]
    # Split into train / validation / test sets.
    train_text, val_text, train_labels, val_labels = train_test_split(In_Text, labels, test_size=0.1, random_state=1)
    train_text, test_text, train_labels, test_labels = train_test_split(train_text, train_labels, test_size=0.1, random_state=1)
    # Define the three writers (TF 1.x API; in TF 2.x this is tf.io.TFRecordWriter).
    writer_train = tf.python_io.TFRecordWriter(output_name + '_train.tfrecords')
    writer_validation = tf.python_io.TFRecordWriter(output_name + '_validation.tfrecords')
    writer_test = tf.python_io.TFRecordWriter(output_name + '_test.tfrecords')
    # Finally write each TFRecord file and do not forget to close it.
    print('Writing Tf Records Train')
    write_tf_records(writer_train, train_text, train_labels, maxlen)
    writer_train.close()
    print('Writing Tf Records Validation')
    write_tf_records(writer_validation, val_text, val_labels, maxlen)
    writer_validation.close()
    print('Writing Tf Records Test')
    write_tf_records(writer_test, test_text, test_labels, maxlen)
    writer_test.close()
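# Illustrative entry point only: the paths and the toy notes/labels below are
# made-up assumptions, not part of the original project; the real inputs come from
# the hospital data pipeline, and the tokenizer files must already exist (created
# by the initializers above).
if __name__ == '__main__':
    example_notes = ['patient admitted with chest pain and shortness of breath',
                     'follow up visit for type 2 diabetes mellitus'] * 50
    example_labels = [['4019', '41401'], ['25000']] * 50
    update_write(tokenizer_text_path='/tokenizer/Text_Tokenizer.pkl',
                 tokenizer_label_path='/tokenizer/Label_Tokenizer.pkl',
                 In_Text=example_notes,
                 labels=example_labels,
                 output_name='example_batch')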