-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
137 lines (108 loc) · 4.75 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import sklearn.metrics
from numpy import random, nanmax
from pandas import concat, read_csv, DataFrame
import math
def load_properties(dirname):
    """Read ``<dirname>/config.txt`` and return its ``key=value`` pairs as a dict.

    Each non-blank line is split on its first ``=``; keys and values are
    stripped of surrounding whitespace. Later occurrences of a key overwrite
    earlier ones.

    Args:
        dirname: directory that contains ``config.txt``.

    Returns:
        dict mapping each key string to its value string.

    Raises:
        ValueError: if a non-blank line contains no ``=``.
    """
    properties = {}
    # The context manager guarantees the handle is closed, even on a parse error.
    with open(dirname + '/config.txt') as config:
        for line in config:
            if not line.strip():
                # Blank lines carry no setting; skip them instead of failing.
                continue
            # Split on the first '=' only so that values may contain '='.
            key, value = line.split('=', 1)
            properties[key.strip()] = value.strip()
    return properties
def cluster_cmd():
    """Return the hard-coded ``rc.py`` command string with its core, walltime and queue options."""
    # NOTE(review): presumably the prefix used to submit jobs to a cluster - confirm with callers.
    command = 'rc.py --cores 1 --walltime 00:10 --queue low'
    return command
def get_num_cores():
    """Return the number of cores to use; currently fixed at 1."""
    # Alternative left by the original author:
    # return -1  # spawning processes on all the available cores of the local machine
    num_cores = 1
    return num_cores
def get_fold_ens(fileName):
    """Parse the predictor indices listed on the first line of ``fileName``.

    The first line is expected to contain ``:: (`` followed by a
    comma-separated list of integers and a closing ``)``, e.g.
    ``... :: (3, 1, 4)`` or the single-element form ``... :: (7,)``.

    Args:
        fileName: path of the text file to read.

    Returns:
        list of int, in the order they appear in the file; an empty list
        when the parentheses are empty.

    Raises:
        IndexError: if the first line does not contain ``:: (``.
        ValueError: if an entry is not an integer.
    """
    with open(fileName, 'r') as f:
        content = f.readline()
    ens = content.split(':: (', 1)[1].split(')')[0]
    # Drop the trailing comma of a one-element tuple; endswith() is also
    # safe when the parentheses are empty.
    if ens.endswith(','):
        ens = ens[:-1]
    if not ens.strip():
        return []
    # Build a real list: under Python 3 a bare map() would be a one-shot iterator.
    return [int(index) for index in ens.split(",")]
def load_arff_headers(filename):
    """Collect the attribute declarations from the header of an ARFF file.

    Lines are read until one starts with ``@data``. Every line starting with
    ``@attribute`` contributes one entry: the second whitespace-separated
    token is the attribute name and the remainder of the line is its type.
    A type written as ``{a,b,c}`` has its braces removed.

    Args:
        filename: path of the ARFF file.

    Returns:
        dict mapping attribute name to the set of comma-separated values of
        its type (a one-element set such as ``{'numeric'}`` for a plain type).

    Raises:
        ValueError: if an ``@attribute`` line has fewer than three tokens.
    """
    dtypes = {}
    # The context manager closes the file even when we break out early.
    with open(filename) as arff:
        for line in arff:
            if line.startswith('@data'):
                break
            if line.startswith('@attribute'):
                # maxsplit=2 keeps a value list containing spaces,
                # e.g. "{red, green}", together as one type token.
                _, name, dtype = line.split(None, 2)
                dtype = dtype.strip()
                if dtype.startswith('{'):
                    dtype = dtype[1:-1]
                dtypes[name] = set(value.strip() for value in dtype.split(','))
    return dtypes
def f_score(labels, predictions, beta = 1.0, pos_label = 1):
    """Return the F-beta score at every point of the precision-recall curve.

    Args:
        labels: true labels, forwarded to
            ``sklearn.metrics.precision_recall_curve``.
        predictions: predicted scores, forwarded as its second argument.
        beta: weighting factor in the F-beta formula (1.0 gives F1).
        pos_label: label treated as the positive class.

    Returns:
        Array with one F-beta value per (precision, recall) pair returned by
        ``precision_recall_curve``. A pair where precision and recall are
        both zero yields NaN (0/0).
    """
    # pos_label is keyword-only in current scikit-learn; passing it
    # positionally raises TypeError there.
    precision, recall, _ = sklearn.metrics.precision_recall_curve(
        labels, predictions, pos_label=pos_label)
    f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    return f_beta
def fmax_score(labels, predictions, beta = 1.0, pos_label = 1):
    """Return the maximum F-beta score over the precision-recall curve (Fmax).

    References:
        Radivojac, P. et al. (2013). A Large-Scale Evaluation of Computational Protein Function Prediction. Nature Methods, 10(3), 221-227.
        Manning, C. D. et al. (2008). Evaluation in Information Retrieval. In Introduction to Information Retrieval. Cambridge University Press.

    Args:
        labels: true labels, forwarded to
            ``sklearn.metrics.precision_recall_curve``.
        predictions: predicted scores, forwarded as its second argument.
        beta: weighting factor in the F-beta formula (1.0 gives F1).
        pos_label: label treated as the positive class.

    Returns:
        The largest F-beta value across the curve, ignoring NaN entries.
    """
    # pos_label is keyword-only in current scikit-learn; passing it
    # positionally raises TypeError there.
    precision, recall, _ = sklearn.metrics.precision_recall_curve(
        labels, predictions, pos_label=pos_label)
    f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    # nanmax skips the NaN produced where precision and recall are both zero.
    return nanmax(f_beta)
def get_set_preds(dirname, set, bag, fold, seed):
    """Load one gzipped prediction CSV and return two of its columns.

    The file read is ``<dirname>/<set>-b<bag>-f<fold>-s<seed>.csv.gz``. Its
    first line is skipped, so the second line supplies the column header.

    Args:
        dirname: directory that holds the prediction files.
        set: set name used as the file-name prefix.
        bag: bag identifier embedded in the file name.
        fold: fold identifier embedded in the file name.
        seed: seed identifier embedded in the file name.

    Returns:
        Tuple ``(y_true, y_score)`` of single-column DataFrames: the columns
        at positions 1 and 2 of the file respectively.
        NOTE(review): presumably the true label and predicted score - confirm
        against the code that writes these files.
    """
    filename = '%s/%s-b%s-f%s-s%s.csv.gz' % (dirname, set, bag, fold, seed)
    df = read_csv(filename, skiprows = 1, compression = 'gzip')
    # DataFrame.ix was removed in pandas 1.0; iloc is the positional equivalent.
    y_true = df.iloc[:, 1:2]
    y_score = df.iloc[:, 2:3]
    return y_true, y_score
def get_bps(project_path, seed, metric, size):
    """Randomly pick ``size`` base predictors, spread evenly over three bins.

    To vary the ensemble size, an (almost) equal number of good, medium and
    weak performance base predictors is selected at random without
    replacement. The lines of
    ``<project_path>/ENSEMBLES/order_of_seed<seed>_<metric>.txt`` are cut, in
    file order, into three consecutive bins and each bin contributes its
    share of the selection.
    NOTE(review): this assumes the order file is sorted from best to worst
    performance - confirm against the code that writes it.

    Args:
        project_path: project root that contains the ``ENSEMBLES`` directory.
        seed: seed used both in the file name and, via ``int(seed)``, to seed
            numpy's global random generator.
        metric: metric name used in the file name.
        size: number of base predictors to select; converted with ``int()``.

    Returns:
        Tuple ``(subset, bps_weight)``: ``subset`` is the list of selected
        lines (first-bin picks first), and ``bps_weight`` maps the 1-based
        position in ``subset`` to the float found after the first comma of
        that line.

    Raises:
        ValueError: from ``numpy.random.choice`` when a bin holds fewer lines
            than must be drawn from it.
    """
    order_file = "%s/ENSEMBLES/order_of_seed%s_%s.txt" % (project_path, seed, metric)
    with open(order_file, 'r') as f:
        content = f.read().splitlines()

    num_bins = 3  # good, medium, weak
    size = int(size)
    random.seed(int(seed))

    # The first `rem` bins hold one extra line; likewise the first
    # `extra_sel` bins contribute one extra selection.
    interval, rem = divmod(len(content), num_bins)
    base_sel, extra_sel = divmod(size, num_bins)

    subset = []
    start = 0
    for bin_index in range(num_bins):
        end = start + interval + (1 if bin_index < rem else 0)
        num_sel = base_sel + (1 if bin_index < extra_sel else 0)
        selected = random.choice(range(start, end), num_sel, replace=False)
        subset.extend(content[bp] for bp in selected)
        start = end

    bps_weight = {
        index: float(bp.split(",")[1]) for index, bp in enumerate(subset, 1)
    }
    return subset, bps_weight
def bps2string(predictors):
    """Format base predictors as a numbered, human-readable text block.

    Args:
        predictors: iterable of predictor entries; each is rendered with
            ``%s`` on its own line.

    Returns:
        A string consisting of a header, one ``"<n> :: <predictor>"`` line
        per entry (numbered from 1) and a closing separator line.
    """
    header = "\n* * * * *\nBase predictors and their weight (performance on the validation, over 5 folds):\n"
    footer = "* * * * *\n"
    # enumerate() replaces the manual counter, join() the repeated += on a
    # variable that used to shadow the builtin ``str``.
    body = "".join(
        "%i :: %s\n" % (index, p) for index, p in enumerate(predictors, 1)
    )
    return header + body + footer
def get_path_bag_weight(predictor):
    """Split a base-predictor entry into its path, bag number and weight.

    Accepted forms are ``"<path>_bag<N>,<weight>"`` and the numbered form
    ``"<i> :: <path>_bag<N>,<weight>"``.

    Args:
        predictor: the entry string to parse.

    Returns:
        Tuple ``(path, bag, weight)`` with ``path`` a str, ``bag`` an int and
        ``weight`` a float (the second comma-separated field).

    Raises:
        IndexError: if the entry has no comma or no ``_bag`` marker.
        ValueError: if the bag or weight is not numeric.
    """
    fields = predictor.split(",")
    name = fields[0]
    weight = float(fields[1].strip())

    # Split on the LAST "_bag" so a path that itself contains "_bag"
    # (e.g. ".../my_bagging/clf_bag2") is kept intact.
    name_parts = name.rsplit("_bag", 1)
    path = name_parts[0]
    bag = int(name_parts[1])

    if "::" in path:
        # Drop the "<i> ::" numbering prefix; tolerate a missing space after "::".
        path = path.split("::", 1)[1].lstrip()
    return path, bag, weight
def aggregate_predictions(predictors, seed, fold, set, RULE):
    """Combine the predictions of several base predictors into one score.

    Each entry of ``predictors`` is parsed with ``get_path_bag_weight`` and
    its predictions are loaded with ``get_set_preds``. With ``RULE == 'WA'``
    the scores are averaged using the parsed weights; with any other rule
    every predictor counts equally (plain mean).

    Args:
        predictors: non-empty sequence of base-predictor entry strings.
        seed: seed identifier forwarded to ``get_set_preds``.
        fold: fold identifier forwarded to ``get_set_preds``.
        set: set name forwarded to ``get_set_preds``.
        RULE: ``'WA'`` for the weighted average; anything else for the mean.

    Returns:
        Tuple ``(y_true, y_score)``: ``y_true`` as loaded for the last
        predictor and ``y_score`` the aggregated scores.

    Raises:
        IndexError: if ``predictors`` is empty.
    """
    if not predictors:
        raise IndexError("predictors must contain at least one base predictor")

    weighted = RULE == 'WA'
    denom = 0
    y_true = None
    y_score = None
    for bp in predictors:
        path, bag, weight = get_path_bag_weight(bp)
        # Apply the same factor to the numerator and the denominator so the
        # result is a true (weighted) mean for every predictor, the first included.
        factor = weight if weighted else 1
        y_true, y_score_current = get_set_preds(path, set, bag, fold, seed)
        contribution = factor * y_score_current
        y_score = contribution if y_score is None else y_score.add(contribution)
        denom += factor

    y_score = y_score / denom
    return (y_true, y_score)