-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_weka_similarity.py
executable file
·360 lines (290 loc) · 14.9 KB
/
preprocess_weka_similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
#!/usr/bin/env python
import sys
import argparse
import math
from collections import OrderedDict
import json
from random import choice
import random
import copy
import os
import cPickle as pickle
EXT_GZ = "gz"
SUFFIX_GZ = os.extsep + EXT_GZ
from contextlib import closing, contextmanager
from gzip import open as _gzip_open
def gzip_open(*args, **kwargs):
return closing(_gzip_open(*args, **kwargs))
# return _gzip_open(*args, **kwargs)
def maybe_gzip_open(filename, *args, **kwargs):
if filename.endswith(SUFFIX_GZ):
return gzip_open(filename, *args, **kwargs)
else:
return open(filename, *args, **kwargs)
parser = argparse.ArgumentParser(description='Annotate CNVs with similarity scores.')
parser.add_argument('weka_file', help="The arff file to annotate.")
parser.add_argument('sim_file', help="Similarity scores file.")
parser.add_argument('output', choices=['no_sim', 'by_sim'], help="Type of output: no_sim to not use similarity scores, by_sim otherwise.")
parser.add_argument('remove_no_feature_instances', choices=['remove', 'keep'], help="Set to remove no feature instances.")
parser.add_argument('neighbour_weight_function', help="The function to use to apply to the neighbour weights. e.g. [x, 2**x, x**2, sqrt(x)]")
parser.add_argument('weighted_gene_duplication', \
choices=['weighted_gene_duplication_no', 'weighted_gene_duplication_sim', 'weighted_gene_duplication_uniform', 'weighted_gene_duplication_max'], \
help="no to not use weights, sim to weight by similarity score, uniform for a uniform weighting out of 1000, max only use the highest similarity score")
parser.add_argument('plusone', choices=['plusone', 'pluszero'], \
help="plusone to use all instances even if the similarity score is 0, pluszero to not include these.")
parser.add_argument('out_gene_file', help="The output file, an arff annotated with similarity scores.")
parser.add_argument('similarity_rank_cutoff', type=int)
parser.add_argument('balance_test', choices=['bt_none', 'bt_remaining', 'bt_patient', 'bt_ptrem'], \
help="none for not balancing, remaining to balance and keep the remaining, patient to balance by patient, ptrem to balance by patient and keep the remainging.")
parser.add_argument('--balance_genes', '-b', action='store_true')
parser.add_argument('--debug', '-d', help="Debug.", action='store_true')
args = parser.parse_args()
if args.neighbour_weight_function == "0":
anwf = "1"
else:
anwf = args.neighbour_weight_function
fn_str = 'lambda x: %s' % args.neighbour_weight_function
print fn_str
NEIGHBOUR_WEIGHT_FUNCTION = eval(fn_str)
f = _gzip_open(args.weka_file, 'r')
line = f.readline().split('\t')
for i in range(len(line)):
if '{' in line[i]:
cutoff_index = i
break
f.close()
prev = None
current = None
# out_gene_file = open(args.out_gene_file, 'w')
log_line_original_gene = []
log_score_original_gene = []
log_line_cnv = []
log_score_cnv = []
# out_cnv_file = open('weka_out_cnv.txt', 'w')
# discarded_cnvs_file = open('discarded_cnvs.txt', 'w')
# discarded_genes_file = open('discarded_genes.txt', 'w')
sim_file = _gzip_open(args.sim_file, 'r')
weka_file = _gzip_open(args.weka_file, 'r')
case_control = None
sim_line = sim_file.readline()
weka_line = weka_file.readline()
gene_to_sim = OrderedDict()
keep_looping = True
info_dict = OrderedDict()
while keep_looping:
if not sim_line:
keep_looping = False
sim_line = '#'
weka_line = '#'
# ASSUME: line is not empty string
# not original_gene line
if sim_line[0] != "#":
sim_line = sim_line.rstrip().split('\t')
if args.debug:
print 'sim_line:', sim_line
gene_to_sim[sim_line[2]] = {'weight' : float(sim_line[3]),
'rank' : int(sim_line[5]),
'sim' : float(sim_line[12]),
}
# original_gene line
else:
# ASSUME: line is not empty string
# not original_gene line
while weka_line[0] != '#':
weka_line = weka_line.rstrip().split('\t')
gene = weka_line[2]
original_gene = weka_line[4]
if args.debug:
print ' each_gene_weka_line:', weka_line
print ' gene:', gene
print ' og:', original_gene
# HARDCODE: phenotype similarity index
if args.output == 'no_sim':
sim_score = 0
elif args.output == 'by_sim':
if args.debug:
print gene_to_sim
if gene_to_sim[gene]['rank'] > args.similarity_rank_cutoff:
sim_score = 0
else:
sim_score = gene_to_sim[gene]['sim'] * NEIGHBOUR_WEIGHT_FUNCTION(gene_to_sim[gene]['weight'])
# if gene in gene_to_sim: # TODO: need new similairty file!!
# sim_score = gene_to_sim[gene]['sim'] * NEIGHBOUR_WEIGHT_FUNCTION(gene_to_sim[gene]['weight'])
# else:
# sim_score = 0
replace_str = ', 6 %s' % sim_score
out_line = '\t'.join(weka_line[cutoff_index:] + [str(sim_score)]).replace(',6 0', replace_str)
# if specified in args, do not add no feature genes
if not (args.remove_no_feature_instances == 'remove' and len(weka_line[0]) == 0):
log_line_original_gene.append(out_line)
log_score_original_gene.append(sim_score)
weka_line = weka_file.readline()
gene_to_sim = OrderedDict()
# if not at the end of the file
if weka_line != '#':
# after while, original_gene_line
weka_line = weka_line.rstrip().split('\t')
# cnv name
prev = current
current = weka_line[9]
if args.debug:
print ' og_weka_line:', weka_line
# print ' ', prev, current, current != prev
# print ' log_line_original_gene', log_line_original_gene
print ' log_score_original_gene', log_score_original_gene
else:
prev = current
current = None
# new cnv
if current != prev and log_line_cnv != [] and case_control:
# if case_control == 'BENIGN':
# log_score_cnv = [(max(log_score_cnv)-ls) for ls in log_score_cnv]
max_score = max(log_score_cnv)
idx = log_score_cnv.index(max_score)
sum_sim_score = sum(log_score_cnv)
if not case_control in info_dict[phenotype][patient_id]:
info_dict[phenotype][patient_id][case_control] = OrderedDict()
info_dict[phenotype][patient_id][case_control][prev] = []
lsc_limit = float(1000)
# lsc_limit = float(len(log_score_cnv) * 2)
if ((sum_sim_score == 0 or len(log_score_cnv) > lsc_limit) and \
(args.weighted_gene_duplication != 'weighted_gene_duplication_no')) or \
((args.weighted_gene_duplication == 'weighted_gene_duplication_no') and \
(case_control != 'HARMFUL' or args.output != 'by_sim')):
log_score_cnv_fractions = [1/lsc_limit for ls in log_score_cnv] # 1 per, special case (all 0 or > lsc_limit)
elif args.weighted_gene_duplication == 'weighted_gene_duplication_uniform':
log_score_cnv_fractions = [(1/float(len(log_score_cnv))) for ls in log_score_cnv] # uniform
elif args.weighted_gene_duplication == 'weighted_gene_duplication_sim':
log_score_cnv_fractions = [ls/float(sum_sim_score) for ls in log_score_cnv] # fractions
elif args.weighted_gene_duplication == 'weighted_gene_duplication_max' or \
(args.weighted_gene_duplication == 'weighted_gene_duplication_no' and case_control == 'HARMFUL' and args.output == 'by_sim'):
log_score_cnv_fractions = [0 for ls in log_score_cnv]
log_score_cnv_fractions[idx] = 1/lsc_limit
else:
raise Exception('Invalid args.weighted_gene_duplication. (%s, %s, %s)' % \
(args.weighted_gene_duplication, case_control, args.output))
if args.debug:
print ' case_control', case_control
print ' idx', idx
print ' sum_sim_score', sum_sim_score
print ' log_score_cnv', log_score_cnv
print ' log_score_cnv_fractions', log_score_cnv_fractions
log_score_cnv_fractions = [int(math.ceil(ls*lsc_limit)) for ls in log_score_cnv_fractions]
if args.debug:
print ' log_score_cnv_fractions', log_score_cnv_fractions
# add 1 to all so genes with hposim of 0 are still represented, +1, plusone
if args.plusone == "plusone":
log_score_cnv_fractions = [ls+1 for ls in log_score_cnv_fractions]
if args.debug:
print ' log_score_cnv_fractions', log_score_cnv_fractions
if args.debug:
print ' log_score_cnv_fractions_sum', sum(log_score_cnv_fractions)
# # gene creation
# if args.weighted_gene_duplication == 'weighted_gene_duplication_no':
# # output all
# if case_control == 'BENIGN' or (case_control == 'HARMFUL' and args.output == 'no_sim'):
# # cnv
# # print >> out_cnv_file, 'xxx'.join(log_line_cnv)
# # gene
# for log_line_cnv_line in log_line_cnv:
# # print >> out_gene_file, log_line_cnv_line
# # info_dict
# gene = log_line_cnv_line.split('\t')[7]
# info_dict[phenotype][patient_id][case_control][prev].append(log_line_cnv_line)
# if args.debug:
# print 'patient_id', patient_id
# print 'prev', prev
# print 'gene', gene
# print 'case_control', case_control
# print 'log_line_cnv_line', log_line_cnv_line
# # output only top by sim score
# elif case_control == 'HARMFUL' and args.output == 'by_sim':
# # discard cnv/genes if no gene has a similarity score > 0
# if max(log_score_cnv) == 0:
# # cnv
# print >> discarded_cnvs_file, 'xxx'.join(log_line_cnv)
# # gene
# for log_line_cnv_line in log_line_cnv:
# print >> discarded_genes_file, log_line_cnv_line
# else:
# # cnv
# # print >> out_cnv_file, log_line_cnv[idx]
# # gene
# # print >> out_gene_file, log_line_cnv[idx]
# # info_dict
# gene = log_line_cnv_line.split('\t')[7]
# if args.debug:
# print 'patient_id', patient_id
# print 'prev', prev
# print 'gene', gene
# print 'case_control', case_control
# print 'log_line_cnv_line', log_line_cnv[idx]
# info_dict[phenotype][patient_id][case_control][prev].append(log_line_cnv[idx])
# elif args.weighted_gene_duplication == 'weighted_gene_duplication_sim' or \
# args.weighted_gene_duplication == 'weighted_gene_duplication_uniform':
# args.weighted_gene_duplication == 'weighted_gene_duplication_max':
if True:
max_counter = 0
for llc, lscf in zip(log_line_cnv, log_score_cnv_fractions):
# out_cnv_file.write('%sxxx' % llc)
gene = llc.split('\t')[7]
if case_control == 'BENIGN':
is_max = 'benign'
elif case_control == 'HARMFUL' and max_counter == idx:
is_max = 'max'
else:
is_max = 'notmax'
# for iteration in range(int(lscf)):
# # gene
# # print >> out_gene_file, llc
# # cnv
# # out_cnv_file.write('%sxxx' % llc)
# # info_dict
# info_dict[phenotype][patient_id][case_control][prev].append('%s\t%s' % (llc, is_max))
weighted_weka_line = '%s\t%s' % (llc, is_max)
weighted_weka_line = weighted_weka_line.replace('}\t%', '}, {%s}\t%%' % lscf)
info_dict[phenotype][patient_id][case_control][prev].append(weighted_weka_line)
if args.debug:
print 'wwl: ', weighted_weka_line
max_counter += 1
# out_cnv_file.write('\n')
else:
raise 'Invalid option for args.weighted_gene_duplication.'
# reset cnv
log_line_cnv = []
log_score_cnv = []
# if not at the end of the file
if weka_line != '#':
case_control = weka_line[10]
phenotype = weka_line[1]
if not phenotype in info_dict:
info_dict[phenotype] = OrderedDict()
patient_id = weka_line[8]
if not patient_id in info_dict[phenotype]:
info_dict[phenotype][patient_id] = OrderedDict()
info_dict[phenotype][patient_id]['BENIGN'] = OrderedDict()
info_dict[phenotype][patient_id]['HARMFUL'] = OrderedDict()
# add the max sim score of neighbour gene as sim score for the original_gene
if log_score_original_gene == []:
max_score_original_gene = 0
else:
max_score_original_gene = max(log_score_original_gene)
# TODO: put this in later, check downstream affects
out_line = '\t'.join(weka_line[cutoff_index:] + [str(max_score_original_gene), weka_line[4].split(',')[0], weka_line[1]]).replace(',6 0', ',6 %s' % max_score_original_gene)
if not (args.remove_no_feature_instances == 'remove' and len(weka_line[0]) == 1):
log_line_cnv.append(out_line)
log_score_cnv.append(max_score_original_gene)
# reset original_gene
log_line_original_gene = []
log_score_original_gene = []
weka_line = weka_file.readline()
sim_line = sim_file.readline()
weka_file.close()
sim_file.close()
# out_cnv_file.close()
# out_gene_file.close()
out_gene_file = open(args.out_gene_file, 'w')
pickle.dump(info_dict, out_gene_file)
out_gene_file.close()
# print "info_dict_res"
# print json.dumps(info_dict, indent=4)