-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare_data_final.py
330 lines (272 loc) · 11.7 KB
/
prepare_data_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
'''
prepares data
'''
import numpy as np
import sparse
'''
global variable, used when preprocessing
'''
# Width of one time bin: trajectory lengths are divided by deltat to obtain
# bin indices (units are not stated in this file).
deltat = 250
# Upper bound on a trajectory length: load_time_var clips longer trajectories
# to T before binning.
T = 3500
'''
reads file and turns to numpy array
'''
def get_file(file_name, dim):
    """Read a comma/newline-delimited text file into a 2-D array of strings.

    The first character of the file is discarded, every newline is treated
    as a comma, the final token is discarded, and the remaining tokens are
    reshaped into rows of ``dim`` columns. No numeric conversion is done;
    callers cast the columns they need.

    Args:
        file_name: path of the text file to read.
        dim: number of columns per row.

    Returns:
        numpy array of strings with shape ``(-1, dim)``.

    Raises:
        ValueError: if the number of remaining tokens is not a multiple
            of ``dim`` (raised by ``reshape``).
    """
    # Context manager so the handle is closed even if read() fails.
    with open(file_name, 'r') as f:
        c = f.read()
    # NOTE(review): the first character is skipped unconditionally --
    # presumably a leading marker/bracket in the exported files; confirm.
    tokens = c[1:].replace('\n', ',').split(',')
    # The last token is dropped; for a file ending in a newline it is the
    # empty string produced by the trailing separator.
    return np.array(tokens)[:-1].reshape((-1, dim))
'''
normalize data
'''
def normalize(data):
    """Min-max scale ``data`` in place along its first axis.

    For every position along the remaining axes, the minimum over axis 0 is
    subtracted and the result is divided by the range (max - min) over
    axis 0. Positions whose range is zero are divided by 1 instead, so
    constant columns become all zeros rather than NaN.

    Broadcasting is used instead of tiling the min/max arrays, so the
    function works for arrays of any rank (the tiled version only worked
    for 2-D input) and does not allocate full-size copies of the extrema.

    Args:
        data: numpy array; modified in place.

    Returns:
        The same array object, holding the scaled values. Because the
        result is written back into ``data``, it is cast to ``data``'s
        dtype (an integer array would therefore truncate the scaled values).
    """
    mins = np.amin(data, axis=0)
    ranges = np.amax(data, axis=0) - mins
    # Avoid division by zero for features that never vary.
    ranges = np.where(ranges == 0, 1, ranges)
    data[...] = (data - mins) / ranges
    return data
'''
load time invariant data
'''
def load_time_invar(data_source):
    """Load the time-invariant feature matrix, ordered by sorted patient id.

    Reads the sparse COO components (``coords``, ``data``, ``shape``) from
    ``s.npz`` under the given data source, densifies them, and reorders the
    rows by the argsort of the first column of the patient-trajectory file.

    Args:
        data_source: name appended to the base data path to locate ``s.npz``.

    Returns:
        Dense 2-D array with rows reordered by sorted patient id.
    """
    # np.load returns an NpzFile that holds an open file handle; the context
    # manager closes it once the three arrays have been read.
    with np.load('placeholder' + data_source + '/s.npz') as raw_data:
        data = sparse.COO(raw_data['coords'], raw_data['data'], tuple(raw_data['shape']))
    data = data.todense()

    pat_trajectories = get_file('placeholder', 2)
    # Patient ids are read as strings, so this ordering is lexicographic.
    pat_sort_i = np.argsort(pat_trajectories[:, 0])
    return data[pat_sort_i, :]
'''
flip time variant data
'''
def flip_data(data, traj_bins):
    """Reverse part of each patient's block of ``data`` in place.

    For patient ``row`` with bin index ``last_bin``, the slice
    ``data[row, :last_bin + 1, :]`` is replaced by itself flipped along its
    second axis (the last axis of ``data``). Bins after ``last_bin`` are
    left untouched.

    Args:
        data: 3-D numpy array indexed as [patient, bin, feature-like axis].
        traj_bins: one integer bin index per patient, in row order.

    Returns:
        The same ``data`` array, modified in place.
    """
    # NOTE(review): axis=1 of the 2-D slice is the last axis of ``data``,
    # not the bin axis -- confirm this is the intended axis to reverse.
    for row, last_bin in enumerate(traj_bins):
        window = data[row, :last_bin + 1, :]
        window[...] = np.flip(window, axis=1)
    return data
'''
load time variant data
data_type determines whether to use a summary of the sequence data or the sequence itself
data_type can be 'summary' or 'sequential'
summary: concatenates the sums of the first num_segs segments (seg_size time bins each) of the time sequence
sequential: keeps the data as a sequence after applying flip_data
'''
def load_time_var(data_source, data_type):
    """Load the time-variant feature tensor as a summary or as a sequence.

    Reads the sparse COO components from ``X.npz`` under the given data
    source and densifies them into a 3-D array indexed as
    [patient, time bin, feature].

    * ``data_type == 'summary'``: for each of the first ``num_segs``
      segments of ``seg_size`` time bins, the features are summed over the
      segment and the per-segment sums are concatenated along the feature
      axis, giving a 2-D array.
    * any other value: the 3-D array is kept and passed through
      ``flip_data`` using per-patient bin indices derived from the
      trajectory file (lengths clipped to ``T``, integer-divided by
      ``deltat``, reordered by sorted patient id).

    The result is min-max normalised with ``normalize`` before returning.

    Args:
        data_source: name appended to the base data path to locate ``X.npz``.
        data_type: ``'summary'`` for the summarised 2-D form; anything else
            selects the sequential 3-D form.

    Returns:
        Normalised numpy array (2-D for summary, 3-D otherwise).
    """
    # np.load returns an NpzFile holding an open file handle; close it as
    # soon as the COO components have been read.
    with np.load('placeholder' + data_source + '/X.npz') as raw_data:
        data = sparse.COO(raw_data['coords'], raw_data['data'], tuple(raw_data['shape']))
    data = np.array(data.todense())

    if data_type == 'summary':
        num_segs = 4  # number of segments per feature vector
        seg_size = 1  # number of bins per segment
        segments = [
            np.sum(data[:, i * seg_size:(i + 1) * seg_size, :], axis=1)
            for i in range(num_segs)
        ]
        # The empty float block keeps the result promoted to at least float64.
        ret_data = np.concatenate([np.zeros((data.shape[0], 0))] + segments, axis=1)
    else:
        # Trajectory lengths are only needed for the sequential form.
        pat_trajectories = get_file('placeholder', 2)
        traj_lens = np.minimum(pat_trajectories[:, 1].astype(int), T)
        traj_bins = traj_lens // deltat
        pat_sort_i = np.argsort(pat_trajectories[:, 0])
        ret_data = flip_data(data, traj_bins[pat_sort_i])

    # NOTE(review): normalisation is applied to both forms here -- confirm
    # that the sequential form is meant to be normalised as well.
    ret_data = normalize(ret_data)
    return ret_data
'''
prepare sequences for input
'''
def prepare_sequences(data):
    """Trim each patient's sequence to its own length, longest first.

    The bin index of every patient is the trajectory length from the
    trajectory file divided by ``deltat`` and truncated to an integer.
    Patient ``row`` keeps ``data[row, :bin + 1, :]``. The trimmed sequences
    are returned ordered by descending bin index.

    Args:
        data: 3-D array indexed as [patient, time bin, feature].

    Returns:
        List of 2-D arrays, one per patient, sorted from longest to shortest.
    """
    # NOTE(review): lengths are used in file order and are not clipped to T,
    # unlike load_time_var -- confirm the rows of ``data`` match file order.
    trajectories = get_file('placeholder', 2)
    lengths = trajectories[:, 1].astype(int)
    bins = (lengths / deltat).astype(int)

    trimmed = [data[row, :last_bin + 1, :] for row, last_bin in enumerate(bins)]

    # argsort is ascending; reversing it yields longest-first order.
    longest_first = np.argsort(bins)[::-1]
    return [trimmed[row] for row in longest_first]
'''
combine time in/variant data
'''
def combine_data(data_source, data_type, var_len=False):
    """Concatenate time-invariant and time-variant features per patient.

    * When the time-variant data is 3-D (sequential form), the
      time-invariant features are repeated at every time step and the two
      are concatenated along the feature axis. With ``var_len=True`` the
      result is then passed through ``prepare_sequences``, which returns a
      list of per-patient arrays.
    * Otherwise (summary form) the two 2-D matrices are concatenated along
      the feature axis.

    Args:
        data_source: name forwarded to the two loaders.
        data_type: ``'summary'`` or a sequential type, forwarded to
            ``load_time_var``.
        var_len: only used for 3-D data; if true, return variable-length
            sequences instead of one padded array.

    Returns:
        2-D array (summary), 3-D array (sequential), or a list of 2-D arrays
        (sequential with ``var_len=True``).
    """
    time_invar = load_time_invar(data_source)
    time_var = load_time_var(data_source, data_type)

    if len(time_var.shape) == 3:
        # Repeat the static features at every time step with one broadcast
        # assignment; np.zeros keeps the float64 dtype of the tiled block.
        tiled = np.zeros((time_invar.shape[0], time_var.shape[1], time_invar.shape[1]))
        tiled[:, :, :] = time_invar[:, np.newaxis, :]
        all_data = np.concatenate((tiled, time_var), axis=2)
        if var_len:
            all_data = prepare_sequences(all_data)
    else:
        # Summary data: use everything, concatenated on the feature axis.
        all_data = np.concatenate((time_invar, time_var), axis=1)
        # Column map of one 3964-wide summary segment of time_var, kept from
        # the retired per-modality selection code (segment i starts at
        # column i * 3964; demographics are the time_invar columns):
        #   util       : 0, 3935:3939, 3948:3964
        #   age        : 0, 3948:3964
        #   visit type : 3935:3939
        #   meds       : 1:11, 41:122, 127:134, 139:162, 1395:1407,
        #                1412:1416, 2720, 3832:3935, 3939:3948
        #   vitals     : 11:41, 122:127, 134:139, 1407:1412
        #   cpts       : 162:1395
        #   icds       : 1416:2720
        #   labs       : 2721:3832
    return all_data
'''
gets patient labels
first column is patient id
second column is the time from alignment to conversion (if they convert, otherwise -1)
third column is label
'''
def get_labels():
    """Load the label table and restrict it to the cohort patient list.

    The label file is read as four string columns and sorted by its first
    column (patient id). Rows whose id is not in the cohort list are
    dropped. The cohort list is the one-column patient file with its first
    entry skipped, sorted.

    Returns:
        Tuple ``(labels, excluded_patients2)`` where ``labels`` is the
        filtered label table (strings) and ``excluded_patients2`` holds the
        positions, within the sorted cohort list, of patients that have no
        row in the label table.
    """
    labels_file = 'placeholder'
    table = get_file(labels_file, 4)
    table = table[np.argsort(table[:, 0]), :]

    # remove extra patients
    cohort = np.sort(get_file('placeholder', 1)[1:].reshape(-1))
    in_cohort = np.isin(table[:, 0], cohort)
    labels = table[in_cohort, :]
    print(labels.shape)

    has_label = np.isin(cohort, labels[:, 0])
    excluded_patients2 = np.flatnonzero(np.logical_not(has_label))
    print(excluded_patients2.shape)

    return labels, excluded_patients2
'''
split data into training and test set
in addition to training and test split, returns indexes of test patients
'''
def split_data(data_source, data_type):
    """Make a stratified 80/20 train/test split of the combined data.

    A patient is positive when the third label column equals 1 and the
    second label column is at most 120. Rows listed by ``get_labels`` as
    excluded are removed from the data. Positives and negatives are each
    shuffled and split 80/20, then the training and test sets are shuffled
    again so the classes are interleaved.

    Args:
        data_source: name forwarded to ``combine_data``.
        data_type: data type forwarded to ``combine_data``.

    Returns:
        Tuple ``(training_data, training_labs, test_data, test_labs,
        test_pat_i)``; ``test_pat_i`` holds the row index (after exclusion)
        of every test patient, as floats.
    """
    # setup
    data = combine_data(data_source, data_type)
    labels_raw, exclude_pats = get_labels()
    is_converter = labels_raw[:, 2].astype(int) == 1
    within_window = labels_raw[:, 1].astype(int) <= 120
    labels = np.logical_and(is_converter, within_window)
    data = np.delete(data, exclude_pats, axis=0)
    print(np.sum(labels), labels.shape, data.shape)

    prop_training = 0.8
    # Empty float seeds keep the concatenated outputs promoted to float,
    # for both the 2-D and the 3-D layout of ``data``.
    empty_rows = np.zeros((0,) + data.shape[1:])
    train_x, test_x = [empty_rows], [empty_rows]
    train_y, test_y = [np.zeros((0,))], [np.zeros((0,))]
    test_idx = [np.zeros((0,))]

    # stratified split: positives first, then negatives
    for pat_set in (np.where(labels == 1)[0], np.where(labels == 0)[0]):
        group_size = pat_set.shape[0]
        shuffled = pat_set[np.random.permutation(group_size)]
        cut = int(prop_training * group_size)
        train_part, test_part = shuffled[:cut], shuffled[cut:]

        train_y.append(labels[train_part])
        test_y.append(labels[test_part])
        test_idx.append(test_part)
        train_x.append(data[train_part])
        test_x.append(data[test_part])

    training_labs = np.concatenate(train_y)
    test_labs = np.concatenate(test_y)
    test_pat_i = np.concatenate(test_idx)
    training_data = np.concatenate(train_x, axis=0)
    test_data = np.concatenate(test_x, axis=0)

    # randomize order of positive/negative examples
    train_perm = np.random.permutation(training_labs.shape[0])
    test_perm = np.random.permutation(test_labs.shape[0])
    return (
        training_data[train_perm],
        training_labs[train_perm],
        test_data[test_perm],
        test_labs[test_perm],
        test_pat_i[test_perm],
    )
'''
splits sequential data
'''
def split_data_sequential(data_source):
    """Make a stratified 80/20 train/test split of variable-length sequences.

    Loads the sequential data as a list of per-patient arrays, takes the
    third label column as the class label, splits positives and negatives
    80/20 after shuffling each group, and finally shuffles the training and
    test sets so the classes are interleaved.

    Args:
        data_source: name forwarded to ``combine_data``.

    Returns:
        Tuple ``(training_data, training_labs, test_data, test_labs,
        test_pat_i)``; the data entries are lists of 2-D arrays and
        ``test_pat_i`` holds the label-row index of every test patient,
        as floats.
    """
    # setup
    data = combine_data(data_source, 'sequential', var_len=True)
    # get_labels returns (label table, excluded indices); indexing the tuple
    # directly with [:, 2] raised a TypeError.
    labels_raw, _excluded = get_labels()
    labels = labels_raw[:, 2].astype(int)
    # NOTE(review): unlike split_data, excluded patients are not removed
    # from ``data`` here, and prepare_sequences returns the sequences sorted
    # by length -- confirm that data[i] still corresponds to labels[i].

    prop_training = 0.8
    train_y, test_y = [np.zeros((0,))], [np.zeros((0,))]
    test_idx = [np.zeros((0,))]
    training_data = []
    test_data = []

    # stratified split: positives first, then negatives
    for pat_set in (np.where(labels == 1)[0], np.where(labels == 0)[0]):
        group_size = pat_set.shape[0]
        shuffled = pat_set[np.random.permutation(group_size)]
        cut = int(prop_training * group_size)

        train_y.append(labels[shuffled[:cut]])
        test_y.append(labels[shuffled[cut:]])
        test_idx.append(shuffled[cut:])
        training_data.extend(data[i] for i in shuffled[:cut])
        test_data.extend(data[i] for i in shuffled[cut:])

    training_labs = np.concatenate(train_y)
    test_labs = np.concatenate(test_y)
    test_pat_i = np.concatenate(test_idx)

    # randomize order of positive/negative examples
    train_perm = np.random.permutation(training_labs.shape[0])
    test_perm = np.random.permutation(test_labs.shape[0])
    training_labs = training_labs[train_perm]
    test_labs = test_labs[test_perm]
    training_data = [training_data[i] for i in train_perm]
    test_data = [test_data[i] for i in test_perm]
    print('done splitting')
    return (training_data, training_labs, test_data, test_labs, test_pat_i)
'''
main block
'''
if __name__ == '__main__':
    # Running the module directly does nothing; the calls below are
    # disabled examples (find_mem_prob is not defined in the visible part
    # of this file).
    pass
    #find_mem_prob()
    #data_type = 'summary'
    #split_data_sequential('first_align')