-
Notifications
You must be signed in to change notification settings - Fork 22
/
pyroc.py
382 lines (310 loc) · 11.9 KB
/
pyroc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/env python
# encoding: utf-8
"""
PyRoc.py
Created by Marcel Caraciolo on 2009-11-16.
Copyright (c) 2009 Federal University of Pernambuco. All rights reserved.
IMPORTANT:
Based on the original code by Eithon Cadag (http://www.eithoncadag.com/files/pyroc.txt)
Python Module for calculating the area under the receive operating characteristic curve, given a dataset.
0.1 - First Release
0.2 - Updated the code by adding new metrics for analysis with the confusion matrix.
"""
import random
import math
try:
import pylab
except:
print "error:\tcan't import pylab module, you must install the module:\n"
print "\tmatplotlib to plot charts!'\n"
def random_mixture_model(pos_mu=.6,pos_sigma=.1,neg_mu=.4,neg_sigma=.1,size=200):
pos = [(1,random.gauss(pos_mu,pos_sigma),) for x in xrange(size/2)]
neg = [(0,random.gauss(neg_mu,neg_sigma),) for x in xrange(size/2)]
return pos+neg
def plot_multiple_rocs_separate(rocList,title='', labels = None, equal_aspect = True):
""" Plot multiples ROC curves as separate at the same painting area. """
pylab.clf()
pylab.title(title)
for ix, r in enumerate(rocList):
ax = pylab.subplot(4,4,ix+1)
pylab.ylim((0,1))
pylab.xlim((0,1))
ax.set_yticklabels([])
ax.set_xticklabels([])
if equal_aspect:
cax = pylab.gca()
cax.set_aspect('equal')
if not labels:
labels = ['' for x in rocList]
pylab.text(0.2,0.1,labels[ix],fontsize=8)
pylab.plot([x[0] for x in r.derived_points],[y[1] for y in r.derived_points], 'r-',linewidth=2)
pylab.show()
def _remove_duplicate_styles(rocList):
""" Checks for duplicate linestyles and replaces duplicates with a random one."""
pref_styles = ['cx-','mx-','yx-','gx-','bx-','rx-']
points = 'ov^>+xd'
colors = 'bgrcmy'
lines = ['-','-.',':']
rand_ls = []
for r in rocList:
if r.linestyle not in rand_ls:
rand_ls.append(r.linestyle)
else:
while True:
if len(pref_styles) > 0:
pstyle = pref_styles.pop()
if pstyle not in rand_ls:
r.linestyle = pstyle
rand_ls.append(pstyle)
break
else:
ls = ''.join(random.sample(colors,1) + random.sample(points,1)+ random.sample(lines,1))
if ls not in rand_ls:
r.linestyle = ls
rand_ls.append(ls)
break
def plot_multiple_roc(rocList,title='',labels=None, include_baseline=False, equal_aspect=True):
""" Plots multiple ROC curves on the same chart.
Parameters:
rocList: the list of ROCData objects
title: The tile of the chart
labels: The labels of each ROC curve
include_baseline: if it's True include the random baseline
equal_aspect: keep equal aspect for all roc curves
"""
pylab.clf()
pylab.ylim((0,1))
pylab.xlim((0,1))
pylab.xticks(pylab.arange(0,1.1,.1))
pylab.yticks(pylab.arange(0,1.1,.1))
pylab.grid(True)
if equal_aspect:
cax = pylab.gca()
cax.set_aspect('equal')
pylab.xlabel("1 - Specificity")
pylab.ylabel("Sensitivity")
pylab.title(title)
if not labels:
labels = [ '' for x in rocList]
_remove_duplicate_styles(rocList)
for ix, r in enumerate(rocList):
pylab.plot([x[0] for x in r.derived_points], [y[1] for y in r.derived_points], r.linestyle, linewidth=1, label=labels[ix])
if include_baseline:
pylab.plot([0.0,1.0], [0.0, 1.0], 'k-', label= 'random')
if labels:
pylab.legend(loc='lower right')
pylab.show()
def load_decision_function(path):
""" Function to load the decision function (DataSet)
Parameters:
path: The dataset file path
Return:
model_data: The data modeled
"""
fileHandler = open(path,'r')
reader = fileHandler.readlines()
reader = [line.strip().split() for line in reader]
model_data = []
for line in reader:
if len(line) == 0: continue
fClass,fValue = line
model_data.append((int(fClass), float(fValue)))
fileHandler.close()
return model_data
class ROCData(object):
""" Class that generates an ROC Curve for the data.
Data is in the following format: a list l of tutples t
where:
t[0] = 1 for positive class and t[0] = 0 for negative class
t[1] = score
t[2] = label
"""
def __init__(self,data,linestyle='rx-'):
""" Constructor takes the data and the line style for plotting the ROC Curve.
Parameters:
data: The data a listl of tuples t (l = [t_0,t_1,...t_n]) where:
t[0] = 1 for positive class and 0 for negative class
t[1] = a score
t[2] = any label (optional)
lineStyle: THe matplotlib style string for plots.
Note: The ROCData is still usable w/o matplotlib. The AUC is still available,
but plots cannot be generated.
"""
self.data = sorted(data,lambda x,y: cmp(y[1],x[1]))
self.linestyle = linestyle
self.auc() #Seed initial points with default full ROC
def auc(self,fpnum=0):
""" Uses the trapezoidal ruel to calculate the area under the curve. If fpnum is supplied, it will
calculate a partial AUC, up to the number of false positives in fpnum (the partial AUC is scaled
to between 0 and 1).
It assumes that the positive class is expected to have the higher of the scores (s(+) < s(-))
Parameters:
fpnum: The cumulativr FP count (fps)
Return:
"""
fps_count = 0
relevant_pauc = []
current_index = 0
max_n = len([x for x in self.data if x[0] == 0])
if fpnum == 0:
relevant_pauc = [x for x in self.data]
elif fpnum > max_n:
fpnum = max_n
#Find the upper limit of the data that does not exceed n FPs
else:
while fps_count < fpnum:
relevant_pauc.append(self.data[current_index])
if self.data[current_index][0] == 0:
fps_count += 1
current_index +=1
total_n = len([x for x in relevant_pauc if x[0] == 0])
total_p = len(relevant_pauc) - total_n
#Convert to points in a ROC
previous_df = -1000000.0
current_index = 0
points = []
tp_count, fp_count = 0.0 , 0.0
tpr, fpr = 0, 0
while current_index < len(relevant_pauc):
df = relevant_pauc[current_index][1]
if previous_df != df:
points.append((fpr,tpr,fp_count))
if relevant_pauc[current_index][0] == 0:
fp_count +=1
elif relevant_pauc[current_index][0] == 1:
tp_count +=1
fpr = fp_count/total_n
tpr = tp_count/total_p
previous_df = df
current_index +=1
points.append((fpr,tpr,fp_count)) #Add last point
points.sort(key=lambda i: (i[0],i[1]))
self.derived_points = points
return self._trapezoidal_rule(points)
def _trapezoidal_rule(self,curve_pts):
""" Method to calculate the area under the ROC curve"""
cum_area = 0.0
for ix,x in enumerate(curve_pts[0:-1]):
cur_pt = x
next_pt = curve_pts[ix+1]
cum_area += ((cur_pt[1]+next_pt[1])/2.0) * (next_pt[0]-cur_pt[0])
return cum_area
def calculateStandardError(self,fpnum=0):
""" Returns the standard error associated with the curve.
Parameters:
fpnum: The cumulativr FP count (fps)
Return:
the standard error.
"""
area = self.auc(fpnum)
#real positive cases
Na = len([ x for x in self.data if x[0] == 1])
#real negative cases
Nn = len([ x for x in self.data if x[0] == 0])
Q1 = area / (2.0 - area)
Q2 = 2 * area * area / (1.0 + area)
return math.sqrt( ( area * (1.0 - area) + (Na - 1.0) * (Q1 - area*area) +
(Nn - 1.0) * (Q2 - area * area)) / (Na * Nn))
def plot(self,title='',include_baseline=False,equal_aspect=True):
""" Method that generates a plot of the ROC curve
Parameters:
title: Title of the chart
include_baseline: Add the baseline plot line if it's True
equal_aspect: Aspects to be equal for all plot
"""
pylab.clf()
pylab.plot([x[0] for x in self.derived_points], [y[1] for y in self.derived_points], self.linestyle)
if include_baseline:
pylab.plot([0.0,1.0], [0.0,1.0],'k-.')
pylab.ylim((0,1))
pylab.xlim((0,1))
pylab.xticks(pylab.arange(0,1.1,.1))
pylab.yticks(pylab.arange(0,1.1,.1))
pylab.grid(True)
if equal_aspect:
cax = pylab.gca()
cax.set_aspect('equal')
pylab.xlabel('1 - Specificity')
pylab.ylabel('Sensitivity')
pylab.title(title)
pylab.show()
def confusion_matrix(self,threshold,do_print=False):
""" Returns the confusion matrix (in dictionary form) for a fiven threshold
where all elements > threshold are considered 1 , all else 0.
Parameters:
threshold: threshold to check the decision function
do_print: if it's True show the confusion matrix in the screen
Return:
the dictionary with the TP, FP, FN, TN
"""
pos_points = [x for x in self.data if x[1] >= threshold]
neg_points = [x for x in self.data if x[1] < threshold]
tp,fp,fn,tn = self._calculate_counts(pos_points,neg_points)
if do_print:
print "\t Actual class"
print "\t+(1)\t-(0)"
print "+(1)\t%i\t%i\tPredicted" % (tp,fp)
print "-(0)\t%i\t%i\tclass" % (fn,tn)
return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn}
def evaluateMetrics(self,matrix,metric=None,do_print=False):
""" Returns the metrics evaluated from the confusion matrix.
Parameters:
matrix: the confusion matrix
metric: the specific metric of the default value is None (all metrics).
do_print: if it's True show the metrics in the screen
Return:
the dictionary with the Accuracy, Sensitivity, Specificity,Efficiency,
PositivePredictiveValue, NegativePredictiveValue, PhiCoefficient
"""
accuracy = (matrix['TP'] + matrix['TN'])/ float(sum(matrix.values()))
sensitivity = (matrix['TP'])/ float(matrix['TP'] + matrix['FN'])
specificity = (matrix['TN'])/float(matrix['TN'] + matrix['FP'])
efficiency = (sensitivity + specificity) / 2.0
positivePredictiveValue = matrix['TP'] / float(matrix['TP'] + matrix['FP'])
NegativePredictiveValue = matrix['TN'] / float(matrix['TN'] + matrix['FN'])
PhiCoefficient = (matrix['TP'] * matrix['TN'] - matrix['FP'] * matrix['FN'])/(
math.sqrt( (matrix['TP'] + matrix['FP']) *
(matrix['TP'] + matrix['FN']) *
(matrix['TN'] + matrix['FP']) *
(matrix['TN'] + matrix['FN']))) or 1.0
if do_print:
print 'Sensitivity: ' , sensitivity
print 'Specificity: ' , specificity
print 'Efficiency: ' , efficiency
print 'Accuracy: ' , accuracy
print 'PositivePredictiveValue: ' , positivePredictiveValue
print 'NegativePredictiveValue' , NegativePredictiveValue
print 'PhiCoefficient' , PhiCoefficient
return {'SENS': sensitivity, 'SPEC': specificity, 'ACC': accuracy, 'EFF': efficiency,
'PPV':positivePredictiveValue, 'NPV':NegativePredictiveValue , 'PHI': PhiCoefficient}
def _calculate_counts(self,pos_data,neg_data):
""" Calculates the number of false positives, true positives, false negatives and true negatives """
tp_count = len([x for x in pos_data if x[0] == 1])
fp_count = len([x for x in pos_data if x[0] == 0])
fn_count = len([x for x in neg_data if x[0] == 1])
tn_count = len([x for x in neg_data if x[0] == 0])
return tp_count,fp_count,fn_count, tn_count
if __name__ == '__main__':
print "PyRoC - ROC Curve Generator"
print "By Marcel Pinheiro Caraciolo (@marcelcaraciolo)"
print "http://aimotion.bogspot.com\n"
from optparse import OptionParser
parser = OptionParser()
parser.add_option('-f', '--file', dest='origFile', help="Path to a file with the class and decision function. The first column of each row is the class, and the second the decision score.")
parser.add_option("-n", "--max fp", dest = "fp_n", default=0, help= "Maximum false positives to calculate up to (for partial AUC).")
parser.add_option("-p","--plot", action="store_true",dest='plotFlag', default=False, help="Plot the ROC curve (matplotlib required)")
parser.add_option("-t",'--title', dest= 'ptitle' , default='' , help = 'Title of plot.')
(options,args) = parser.parse_args()
if (not options.origFile):
parser.print_help()
exit()
df_data = load_decision_function(options.origFile)
roc = ROCData(df_data)
roc_n = int(options.fp_n)
print "ROC AUC: %s" % (str(roc.auc(roc_n)),)
print 'Standard Error: %s' % (str(roc.calculateStandardError(roc_n)),)
print ''
for pt in roc.derived_points:
print pt[0],pt[1]
if options.plotFlag:
roc.plot(options.ptitle,True,True)