-
Notifications
You must be signed in to change notification settings - Fork 4
/
evaluation_metrics.py
342 lines (270 loc) · 18 KB
/
evaluation_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
import numpy as np
import sys
def obtain_asv_error_rates(tar_asv, non_asv, spoof_asv, asv_threshold):
# False alarm and miss rates for ASV
Pfa_asv = sum(non_asv >= asv_threshold) / non_asv.size
Pmiss_asv = sum(tar_asv < asv_threshold) / tar_asv.size
# Rate of rejecting spoofs in ASV
if spoof_asv.size == 0:
Pmiss_spoof_asv = None
Pfa_spoof_asv = None
else:
Pmiss_spoof_asv = np.sum(spoof_asv < asv_threshold) / spoof_asv.size
Pfa_spoof_asv = np.sum(spoof_asv >= asv_threshold) / spoof_asv.size
return Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, Pfa_spoof_asv
def compute_det_curve(target_scores, nontarget_scores):
n_scores = target_scores.size + nontarget_scores.size
all_scores = np.concatenate((target_scores, nontarget_scores))
labels = np.concatenate((np.ones(target_scores.size), np.zeros(nontarget_scores.size)))
# Sort labels based on scores
indices = np.argsort(all_scores, kind='mergesort')
labels = labels[indices]
# Compute false rejection and false acceptance rates
tar_trial_sums = np.cumsum(labels)
nontarget_trial_sums = nontarget_scores.size - (np.arange(1, n_scores + 1) - tar_trial_sums)
frr = np.concatenate((np.atleast_1d(0), tar_trial_sums / target_scores.size)) # false rejection rates
far = np.concatenate((np.atleast_1d(1), nontarget_trial_sums / nontarget_scores.size)) # false acceptance rates
thresholds = np.concatenate((np.atleast_1d(all_scores[indices[0]] - 0.001), all_scores[indices])) # Thresholds are the sorted scores
return frr, far, thresholds
def compute_eer(target_scores, nontarget_scores):
""" Returns equal error rate (EER) and the corresponding threshold. """
frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores)
abs_diffs = np.abs(frr - far)
min_index = np.argmin(abs_diffs)
eer = np.mean((frr[min_index], far[min_index]))
return eer, thresholds[min_index]
def compute_tDCF(bonafide_score_cm, spoof_score_cm, Pfa_asv, Pmiss_asv, Pfa_spoof_asv, cost_model, print_cost):
"""
Compute Tandem Detection Cost Function (t-DCF) [1] for a fixed ASV system.
In brief, t-DCF returns a detection cost of a cascaded system of this form,
Speech waveform -> [CM] -> [ASV] -> decision
where CM stands for countermeasure and ASV for automatic speaker
verification. The CM is therefore used as a 'gate' to decided whether or
not the input speech sample should be passed onwards to the ASV system.
Generally, both CM and ASV can do detection errors. Not all those errors
are necessarily equally cost, and not all types of users are necessarily
equally likely. The tandem t-DCF gives a principled with to compare
different spoofing countermeasures under a detection cost function
framework that takes that information into account.
INPUTS:
bonafide_score_cm A vector of POSITIVE CLASS (bona fide or human)
detection scores obtained by executing a spoofing
countermeasure (CM) on some positive evaluation trials.
trial represents a bona fide case.
spoof_score_cm A vector of NEGATIVE CLASS (spoofing attack)
detection scores obtained by executing a spoofing
CM on some negative evaluation trials.
Pfa_asv False alarm (false acceptance) rate of the ASV
system that is evaluated in tandem with the CM.
Assumed to be in fractions, not percentages.
Pmiss_asv Miss (false rejection) rate of the ASV system that
is evaluated in tandem with the spoofing CM.
Assumed to be in fractions, not percentages.
Pmiss_spoof_asv Miss rate of spoof samples of the ASV system that
is evaluated in tandem with the spoofing CM. That
is, the fraction of spoof samples that were
rejected by the ASV system.
cost_model A struct that contains the parameters of t-DCF,
with the following fields.
Ptar Prior probability of target speaker.
Pnon Prior probability of nontarget speaker (zero-effort impostor)
Psoof Prior probability of spoofing attack.
Cmiss Cost of tandem system falsely rejecting target speaker.
Cfa Cost of tandem system falsely accepting nontarget speaker.
Cfa_spoof Cost of tandem system falsely accepting spoof.
print_cost Print a summary of the cost parameters and the
implied t-DCF cost function?
OUTPUTS:
tDCF_norm Normalized t-DCF curve across the different CM
system operating points; see [2] for more details.
Normalized t-DCF > 1 indicates a useless
countermeasure (as the tandem system would do
better without it). min(tDCF_norm) will be the
minimum t-DCF used in ASVspoof 2019 [2].
CM_thresholds Vector of same size as tDCF_norm corresponding to
the CM threshold (operating point).
NOTE:
o In relative terms, higher detection scores values are assumed to
indicate stronger support for the bona fide hypothesis.
o You should provide real-valued soft scores, NOT hard decisions. The
recommendation is that the scores are log-likelihood ratios (LLRs)
from a bonafide-vs-spoof hypothesis based on some statistical model.
This, however, is NOT required. The scores can have arbitrary range
and scaling.
o Pfa_asv, Pmiss_asv, Pmiss_spoof_asv are in fractions, not percentages.
References:
[1] T. Kinnunen, H. Delgado, N. Evans,K.-A. Lee, V. Vestman,
A. Nautsch, M. Todisco, X. Wang, M. Sahidullah, J. Yamagishi,
and D.-A. Reynolds, "Tandem Assessment of Spoofing Countermeasures
and Automatic Speaker Verification: Fundamentals," IEEE/ACM Transaction on
Audio, Speech and Language Processing (TASLP).
[2] ASVspoof 2019 challenge evaluation plan
https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf
"""
# Sanity check of cost parameters
if cost_model['Cfa'] < 0 or cost_model['Cmiss'] < 0 or \
cost_model['Cfa'] < 0 or cost_model['Cmiss'] < 0:
print('WARNING: Usually the cost values should be positive!')
if cost_model['Ptar'] < 0 or cost_model['Pnon'] < 0 or cost_model['Pspoof'] < 0 or \
np.abs(cost_model['Ptar'] + cost_model['Pnon'] + cost_model['Pspoof'] - 1) > 1e-10:
sys.exit('ERROR: Your prior probabilities should be positive and sum up to one.')
# Unless we evaluate worst-case model, we need to have some spoof tests against asv
if Pfa_spoof_asv is None:
sys.exit('ERROR: you should provide false alarm rate of spoof tests against your ASV system.')
# Sanity check of scores
combined_scores = np.concatenate((bonafide_score_cm, spoof_score_cm))
if np.isnan(combined_scores).any() or np.isinf(combined_scores).any():
sys.exit('ERROR: Your scores contain nan or inf.')
# Sanity check that inputs are scores and not decisions
n_uniq = np.unique(combined_scores).size
if n_uniq < 3:
sys.exit('ERROR: You should provide soft CM scores - not binary decisions')
# Obtain miss and false alarm rates of CM
Pmiss_cm, Pfa_cm, CM_thresholds = compute_det_curve(bonafide_score_cm, spoof_score_cm)
# Constants - see ASVspoof 2019 evaluation plan
C0 = cost_model['Ptar'] * cost_model['Cmiss'] * Pmiss_asv + cost_model['Pnon']*cost_model['Cfa']*Pfa_asv
C1 = cost_model['Ptar'] * cost_model['Cmiss'] - (cost_model['Ptar'] * cost_model['Cmiss'] * Pmiss_asv + cost_model['Pnon'] * cost_model['Cfa'] * Pfa_asv)
C2 = cost_model['Pspoof'] * cost_model['Cfa_spoof'] * Pfa_spoof_asv;
# Sanity check of the weights
if C0 < 0 or C1 < 0 or C2 < 0:
sys.exit('You should never see this error but I cannot evalute tDCF with negative weights - please check whether your ASV error rates are correctly computed?')
# Obtain t-DCF curve for all thresholds
tDCF = C0 + C1 * Pmiss_cm + C2 * Pfa_cm
# Obtain default t-DCF
tDCF_default = C0 + np.minimum(C1, C2)
# Normalized t-DCF
tDCF_norm = tDCF / tDCF_default
# Everything should be fine if reaching here.
if print_cost:
print('t-DCF evaluation from [Nbona={}, Nspoof={}] trials\n'.format(bonafide_score_cm.size, spoof_score_cm.size))
print('t-DCF MODEL')
print(' Ptar = {:8.5f} (Prior probability of target user)'.format(cost_model['Ptar']))
print(' Pnon = {:8.5f} (Prior probability of nontarget user)'.format(cost_model['Pnon']))
print(' Pspoof = {:8.5f} (Prior probability of spoofing attack)'.format(cost_model['Pspoof']))
print(' Cfa = {:8.5f} (Cost of tandem system falsely accepting a nontarget)'.format(cost_model['Cfa']))
print(' Cmiss = {:8.5f} (Cost of tandem system falsely rejecting target speaker)'.format(cost_model['Cmiss']))
print(' Cfa_spoof = {:8.5f} (Cost of tandem sysmte falsely accepting spoof)'.format(cost_model['Cfa_spoof']))
print('\n Implied normalized t-DCF function (depends on t-DCF parameters and ASV errors), t_CM=CM threshold)')
print(' tDCF_norm(t_CM) = {:8.5f} + {:8.5f} x Pmiss_cm(t_CM) + {:8.5f} x Pfa_cm(t_CM)\n'.format(C0/tDCF_default, C1/tDCF_default, C2/tDCF_default))
print(' * The optimum value is given by the first term (0.06273). This is the normalized t-DCF obtained with an error-free CM system.')
print(' * The minimum normalized cost (minimum over all possible thresholds) is always <= 1.00.')
print('')
return tDCF_norm, CM_thresholds
def compute_tDCF_legacy(bonafide_score_cm, spoof_score_cm, Pfa_asv, Pmiss_asv, Pmiss_spoof_asv, cost_model, print_cost):
"""
Compute Tandem Detection Cost Function (t-DCF) [1] for a fixed ASV system.
In brief, t-DCF returns a detection cost of a cascaded system of this form,
Speech waveform -> [CM] -> [ASV] -> decision
where CM stands for countermeasure and ASV for automatic speaker
verification. The CM is therefore used as a 'gate' to decided whether or
not the input speech sample should be passed onwards to the ASV system.
Generally, both CM and ASV can do detection errors. Not all those errors
are necessarily equally cost, and not all types of users are necessarily
equally likely. The tandem t-DCF gives a principled with to compare
different spoofing countermeasures under a detection cost function
framework that takes that information into account.
INPUTS:
bonafide_score_cm A vector of POSITIVE CLASS (bona fide or human)
detection scores obtained by executing a spoofing
countermeasure (CM) on some positive evaluation trials.
trial represents a bona fide case.
spoof_score_cm A vector of NEGATIVE CLASS (spoofing attack)
detection scores obtained by executing a spoofing
CM on some negative evaluation trials.
Pfa_asv False alarm (false acceptance) rate of the ASV
system that is evaluated in tandem with the CM.
Assumed to be in fractions, not percentages.
Pmiss_asv Miss (false rejection) rate of the ASV system that
is evaluated in tandem with the spoofing CM.
Assumed to be in fractions, not percentages.
Pmiss_spoof_asv Miss rate of spoof samples of the ASV system that
is evaluated in tandem with the spoofing CM. That
is, the fraction of spoof samples that were
rejected by the ASV system.
cost_model A struct that contains the parameters of t-DCF,
with the following fields.
Ptar Prior probability of target speaker.
Pnon Prior probability of nontarget speaker (zero-effort impostor)
Psoof Prior probability of spoofing attack.
Cmiss_asv Cost of ASV falsely rejecting target.
Cfa_asv Cost of ASV falsely accepting nontarget.
Cmiss_cm Cost of CM falsely rejecting target.
Cfa_cm Cost of CM falsely accepting spoof.
print_cost Print a summary of the cost parameters and the
implied t-DCF cost function?
OUTPUTS:
tDCF_norm Normalized t-DCF curve across the different CM
system operating points; see [2] for more details.
Normalized t-DCF > 1 indicates a useless
countermeasure (as the tandem system would do
better without it). min(tDCF_norm) will be the
minimum t-DCF used in ASVspoof 2019 [2].
CM_thresholds Vector of same size as tDCF_norm corresponding to
the CM threshold (operating point).
NOTE:
o In relative terms, higher detection scores values are assumed to
indicate stronger support for the bona fide hypothesis.
o You should provide real-valued soft scores, NOT hard decisions. The
recommendation is that the scores are log-likelihood ratios (LLRs)
from a bonafide-vs-spoof hypothesis based on some statistical model.
This, however, is NOT required. The scores can have arbitrary range
and scaling.
o Pfa_asv, Pmiss_asv, Pmiss_spoof_asv are in fractions, not percentages.
References:
[1] T. Kinnunen, K.-A. Lee, H. Delgado, N. Evans, M. Todisco,
M. Sahidullah, J. Yamagishi, D.A. Reynolds: "t-DCF: a Detection
Cost Function for the Tandem Assessment of Spoofing Countermeasures
and Automatic Speaker Verification", Proc. Odyssey 2018: the
Speaker and Language Recognition Workshop, pp. 312--319, Les Sables d'Olonne,
France, June 2018 (https://www.isca-speech.org/archive/Odyssey_2018/pdfs/68.pdf)
[2] ASVspoof 2019 challenge evaluation plan
https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf
"""
# Sanity check of cost parameters
if cost_model['Cfa_asv'] < 0 or cost_model['Cmiss_asv'] < 0 or \
cost_model['Cfa_cm'] < 0 or cost_model['Cmiss_cm'] < 0:
print('WARNING: Usually the cost values should be positive!')
if cost_model['Ptar'] < 0 or cost_model['Pnon'] < 0 or cost_model['Pspoof'] < 0 or \
np.abs(cost_model['Ptar'] + cost_model['Pnon'] + cost_model['Pspoof'] - 1) > 1e-10:
sys.exit('ERROR: Your prior probabilities should be positive and sum up to one.')
# Unless we evaluate worst-case model, we need to have some spoof tests against asv
if Pmiss_spoof_asv is None:
sys.exit('ERROR: you should provide miss rate of spoof tests against your ASV system.')
# Sanity check of scores
combined_scores = np.concatenate((bonafide_score_cm, spoof_score_cm))
if np.isnan(combined_scores).any() or np.isinf(combined_scores).any():
sys.exit('ERROR: Your scores contain nan or inf.')
# Sanity check that inputs are scores and not decisions
n_uniq = np.unique(combined_scores).size
if n_uniq < 3:
sys.exit('ERROR: You should provide soft CM scores - not binary decisions')
# Obtain miss and false alarm rates of CM
Pmiss_cm, Pfa_cm, CM_thresholds = compute_det_curve(bonafide_score_cm, spoof_score_cm)
# Constants - see ASVspoof 2019 evaluation plan
C1 = cost_model['Ptar'] * (cost_model['Cmiss_cm'] - cost_model['Cmiss_asv'] * Pmiss_asv) - \
cost_model['Pnon'] * cost_model['Cfa_asv'] * Pfa_asv
C2 = cost_model['Cfa_cm'] * cost_model['Pspoof'] * (1 - Pmiss_spoof_asv)
# Sanity check of the weights
if C1 < 0 or C2 < 0:
sys.exit('You should never see this error but I cannot evalute tDCF with negative weights - please check whether your ASV error rates are correctly computed?')
# Obtain t-DCF curve for all thresholds
tDCF = C1 * Pmiss_cm + C2 * Pfa_cm
# Normalized t-DCF
tDCF_norm = tDCF / np.minimum(C1, C2)
# Everything should be fine if reaching here.
if print_cost:
print('t-DCF evaluation from [Nbona={}, Nspoof={}] trials\n'.format(bonafide_score_cm.size, spoof_score_cm.size))
print('t-DCF MODEL')
print(' Ptar = {:8.5f} (Prior probability of target user)'.format(cost_model['Ptar']))
print(' Pnon = {:8.5f} (Prior probability of nontarget user)'.format(cost_model['Pnon']))
print(' Pspoof = {:8.5f} (Prior probability of spoofing attack)'.format(cost_model['Pspoof']))
print(' Cfa_asv = {:8.5f} (Cost of ASV falsely accepting a nontarget)'.format(cost_model['Cfa_asv']))
print(' Cmiss_asv = {:8.5f} (Cost of ASV falsely rejecting target speaker)'.format(cost_model['Cmiss_asv']))
print(' Cfa_cm = {:8.5f} (Cost of CM falsely passing a spoof to ASV system)'.format(cost_model['Cfa_cm']))
print(' Cmiss_cm = {:8.5f} (Cost of CM falsely blocking target utterance which never reaches ASV)'.format(cost_model['Cmiss_cm']))
print('\n Implied normalized t-DCF function (depends on t-DCF parameters and ASV errors), s=CM threshold)')
if C2 == np.minimum(C1, C2):
print(' tDCF_norm(s) = {:8.5f} x Pmiss_cm(s) + Pfa_cm(s)\n'.format(C1 / C2))
else:
print(' tDCF_norm(s) = Pmiss_cm(s) + {:8.5f} x Pfa_cm(s)\n'.format(C2 / C1))
return tDCF_norm, CM_thresholds