"""
Module containing utility functions needed for PRMSE simulations.

:author: Nitin Madnani
:author: Anastassia Loukina
:organization: ETS
:date: March 2020
"""

import itertools

import numpy as np
import pandas as pd

from rsmtool.analyzer import Analyzer
from rsmtool.utils.prmse import prmse_true
from scipy.stats import pearsonr


def get_rater_pairs(rater_ids, num_pairs, seed=1234567890):
    """
    Randomly sample the given number of rater pairs from the given rater IDs.

    Parameters
    ----------
    rater_ids : list of str
        A list of rater IDs from which we have to randomly sample pairs.
    num_pairs : int
        Number of rater pairs we want to sample.
    seed : int, optional
        The seed for the random number generator that will be
        used to sample the rater pairs.
        Defaults to 1234567890.

    Returns
    -------
    rater_pairs : list of lists of str
        A list containing the required number of randomly sampled
        rater pairs. Each pair is a list of two rater IDs,
        e.g., ``["h_1", "h_33"]``.
    """
    # first we generate all possible rater pair combinations
    all_pairs = [f"{rater1}+{rater2}" for (rater1, rater2)
                 in itertools.combinations(rater_ids, 2)]

    # next we randomly sample as many rater pairs as we need
    prng = np.random.RandomState(seed)
    chosen_pairs = prng.choice(all_pairs, size=num_pairs, replace=False)

    # split each chosen pair back into its two rater IDs
    return [pair.split('+') for pair in chosen_pairs]
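

# Illustrative usage sketch (not part of the original module): the 50 rater IDs
# and the number of pairs below are arbitrary values chosen for demonstration.
def _example_get_rater_pairs():
    """Sample 10 random rater pairs from 50 hypothetical simulated raters."""
    rater_ids = [f"h_{i}" for i in range(1, 51)]
    # each returned pair is a 2-element list such as ["h_3", "h_41"]
    return get_rater_pairs(rater_ids, num_pairs=10)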


def compute_agreement_one_system_one_rater_pair(df_scores,
                                                system_id,
                                                rater_id1,
                                                rater_id2,
                                                include_mean=False):
    """
    Evaluate the given system against the given pair of raters.

    This function computes the agreement metrics between the scores
    assigned by the given simulated system (``system_id``) and the scores
    assigned by the two simulated raters ``rater_id1`` and ``rater_id2``.
    The agreement metrics computed are: Pearson's correlation, R^2,
    quadratically-weighted kappa, and the difference between the human-human
    Pearson correlation and the human-machine Pearson correlation (commonly
    known as "degradation"). All 4 metrics are computed against the scores
    of the first rater in the pair and, if ``include_mean`` is ``True``, also
    against the average of the scores assigned by both raters in the pair.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores.
        This is usually one of the data frames returned
        by the ``simulation.dataset.Dataset.to_frame()``
        method.
    system_id : str
        The ID for the simulated system to be evaluated.
        This must be a column in ``df_scores``.
    rater_id1 : str
        The ID for the first rater in the rater pair
        being used to evaluate the given system.
        This must be a column in ``df_scores``.
    rater_id2 : str
        The ID for the second rater in the rater pair
        being used to evaluate the given system.
        This must be a column in ``df_scores``.
    include_mean : bool, optional
        If set to ``True``, also include the metric values
        computed against the average of the scores assigned
        by both raters in the given pair.
        Defaults to ``False``.

    Returns
    -------
    metrics_series : list of pandas.Series
        A list containing 1 or 2 pandas series depending on the value
        of ``include_mean``. If it is ``True``, this list contains
        two series: the first containing the values of the metrics
        against the average of the two rater scores and the second
        containing the values of the metrics against the scores of
        the first rater. If ``include_mean`` is ``False``, this list
        only contains a single series: the one containing the metric
        values against the scores of the first rater. Any series
        returned will contain the following columns:

        1. "r" - the Pearson's correlation between the system score
           and the reference score.
        2. "QWK" - the quadratically-weighted kappa between the system score
           and the reference score.
        3. "R2" - the R^2 score between the system score
           and the reference score.
        4. "degradation" - the difference between the human-human correlation
           and the human-machine correlation. Note that this column
           may not be included in the output if any of the scores for either of
           the two simulated raters are null, e.g., if some of the responses are
           single scored.
        5. "reference" - a column indicating whether the metric values were
           computed against the average of the two rater scores (``h1-h2 mean``)
           or the first rater's scores (``h1``).
    """
    # compute the inter-rater correlation that we need for degradation
    try:
        rater1_rater2_correlation = pearsonr(df_scores[rater_id1], df_scores[rater_id2])[0]
    except ValueError:
        rater1_rater2_correlation = None

    # we only want these 3 metrics to start with
    chosen_metrics = ['wtkappa', 'corr', 'R2']

    # compute the metrics against the first rater as a series
    h1_metric_values = Analyzer.metrics_helper(df_scores[rater_id1], df_scores[system_id])
    h1_metric_values = h1_metric_values[chosen_metrics]

    # compute the degradation values
    if rater1_rater2_correlation is not None:
        h1_metric_values['degradation'] = rater1_rater2_correlation - h1_metric_values['corr']

    # add a new column called "reference" indicating whether we used
    # the h1-h2 average score or just the h1 score
    h1_metric_values['reference'] = 'h1'

    # rename some of the metrics to have more recognizable names
    h1_metric_values.rename({'wtkappa': 'QWK', 'corr': 'r'}, inplace=True)

    # compute the metrics against the average of the two rater scores
    # as a series, if that was requested
    if include_mean:
        mean_metric_values = Analyzer.metrics_helper(df_scores[[rater_id1, rater_id2]].mean(axis=1),
                                                     df_scores[system_id])
        mean_metric_values = mean_metric_values[chosen_metrics]
        if rater1_rater2_correlation is not None:
            mean_metric_values['degradation'] = rater1_rater2_correlation - mean_metric_values['corr']
        mean_metric_values['reference'] = 'h1-h2 mean'
        mean_metric_values.rename({'wtkappa': 'QWK', 'corr': 'r'}, inplace=True)

    # return the right number of metric series
    ans = [mean_metric_values, h1_metric_values] if include_mean else [h1_metric_values]
    return ans
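

# Illustrative usage sketch (not part of the original module): the data frame
# below is synthetic, and the column names "sys_1", "h_1", and "h_2" are
# assumptions made purely for demonstration.
def _example_agreement_one_pair():
    """Evaluate one hypothetical system against one hypothetical rater pair."""
    prng = np.random.RandomState(42)
    true_scores = prng.randint(1, 7, size=500)
    df_scores = pd.DataFrame({'sys_1': true_scores + prng.normal(0, 0.5, size=500),
                              'h_1': true_scores + prng.normal(0, 1, size=500),
                              'h_2': true_scores + prng.normal(0, 1, size=500)})
    # returns [mean-reference series, h1-reference series] since include_mean=True
    return compute_agreement_one_system_one_rater_pair(df_scores,
                                                       'sys_1',
                                                       'h_1',
                                                       'h_2',
                                                       include_mean=True)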


def compute_agreement_one_system_multiple_rater_pairs(df_scores,
                                                      system_id,
                                                      rater_pairs,
                                                      include_mean=False):
    """
    Compute agreement for the given system against all given rater pairs.

    This function computes the values of conventional metrics of agreement
    between the scores of the given system (``system_id``) and the scores
    assigned by each of the given pairs of simulated raters.
    It simply calls ``compute_agreement_one_system_one_rater_pair()``
    for each rater pair and combines the output. Refer to that
    function for more details.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores.
        This is usually one of the data frames returned
        by the ``simulation.dataset.Dataset.to_frame()``
        method.
    system_id : str
        The ID for the simulated system to be evaluated.
        This must be a column in ``df_scores``.
    rater_pairs : list of lists of str
        A list containing rater pairs against which
        the system is to be evaluated. Each rater
        pair is a list of two rater IDs, e.g.,
        ``["h_1", "h_33"]``.
    include_mean : bool, optional
        If set to ``True``, also include the metric values
        computed against the average of the scores assigned
        by both raters in each pair.
        Defaults to ``False``.

    Returns
    -------
    df_metrics : pandas.DataFrame
        A pandas data frame with one row per returned series and the same
        columns as the series returned by
        ``compute_agreement_one_system_one_rater_pair()``.
    """
    # initialize a list that will hold the series
    metrics_for_all_pairs = []

    # iterate over each given rater pair
    for rater_id1, rater_id2 in rater_pairs:

        # call the per-pair function
        metrics_for_this_pair = compute_agreement_one_system_one_rater_pair(df_scores,
                                                                            system_id,
                                                                            rater_id1,
                                                                            rater_id2,
                                                                            include_mean=include_mean)

        # add the returned series to the list of all metrics
        metrics_for_all_pairs.extend(metrics_for_this_pair)

    # create a data frame from the list of series
    df_metrics = pd.DataFrame(metrics_for_all_pairs)
    return df_metrics
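

# Illustrative usage sketch (not part of the original module): assumes a synthetic
# data frame like the one built in _example_agreement_one_pair(), but with extra
# rater columns "h_3" and "h_4"; all column names here are assumptions.
def _example_agreement_multiple_pairs(df_scores):
    """Evaluate one hypothetical system against several rater pairs at once."""
    rater_pairs = [['h_1', 'h_2'], ['h_3', 'h_4']]
    return compute_agreement_one_system_multiple_rater_pairs(df_scores,
                                                             'sys_1',
                                                             rater_pairs,
                                                             include_mean=True)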


def compute_agreement_multiple_systems_one_rater_pair(df_scores,
                                                      system_ids,
                                                      rater_id1,
                                                      rater_id2,
                                                      include_mean=False):
    """
    Compute agreement for the given systems against the given rater pair.

    This function computes the values of conventional metrics of agreement
    between the scores of the given list of systems (with IDs ``system_ids``)
    and the scores assigned by the two simulated raters ``rater_id1``
    and ``rater_id2``.
    It simply calls ``compute_agreement_one_system_one_rater_pair()``
    for each system and combines the output. Refer to that
    function for more details.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores.
        This is usually one of the data frames returned
        by the ``simulation.dataset.Dataset.to_frame()``
        method.
    system_ids : list of str
        The list of IDs of the simulated systems to be evaluated.
        Each ID must be a column in ``df_scores``.
    rater_id1 : str
        The ID for the first rater in the rater pair
        being used to evaluate the given systems.
        This must be a column in ``df_scores``.
    rater_id2 : str
        The ID for the second rater in the rater pair
        being used to evaluate the given systems.
        This must be a column in ``df_scores``.
    include_mean : bool, optional
        If set to ``True``, also include the metric values
        computed against the average of the scores assigned
        by both raters in the given pair.
        Defaults to ``False``.

    Returns
    -------
    df_metrics : pandas.DataFrame
        A data frame containing the metric values for each of the given
        systems, with an additional "system_id" column identifying the
        system that each row belongs to.
    """
    # initialize an empty list we will use to save each system ID's results
    metrics = []

    # iterate over each system ID
    for system_id in system_ids:

        # compute the metric series for this system ID against the rater pair
        metric_series = compute_agreement_one_system_one_rater_pair(df_scores,
                                                                    system_id,
                                                                    rater_id1,
                                                                    rater_id2,
                                                                    include_mean=include_mean)

        # save the current system ID in the same series
        for series in metric_series:
            series['system_id'] = system_id

        # save the series in the list
        metrics.extend(metric_series)

    # convert the list of series into a data frame and return
    return pd.DataFrame(metrics)
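

# Illustrative usage sketch (not part of the original module): the system and
# rater column names below are assumptions about how the simulated scores
# data frame is laid out.
def _example_agreement_multiple_systems(df_scores):
    """Evaluate several hypothetical systems against a single rater pair."""
    return compute_agreement_multiple_systems_one_rater_pair(df_scores,
                                                             ['sys_1', 'sys_2'],
                                                             'h_1',
                                                             'h_2')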


def compute_prmse_one_system_multiple_rater_pairs(df_scores, system_id, rater_pairs):
    """
    Compute the PRMSE score for the system against all given rater pairs.

    This function computes the value of the PRMSE metric between
    the scores of the given system (``system_id``) and the scores
    assigned by each of the given pairs of simulated raters.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores.
        This is usually one of the data frames returned
        by the ``simulation.dataset.Dataset.to_frame()``
        method.
    system_id : str
        The ID for the simulated system to be evaluated.
        This must be a column in ``df_scores``.
    rater_pairs : list of lists of str
        A list containing rater pairs against which
        the system is to be evaluated. Each rater
        pair is a list of two rater IDs, e.g.,
        ``["h_1", "h_33"]``.

    Returns
    -------
    prmse_values : list of float
        A list containing the values of the PRMSE metric
        for each of the given rater pairs.
    """
    # initialize a list that will hold the PRMSE values
    prmse_for_all_pairs = []

    # iterate over each given rater pair
    for rater_id1, rater_id2 in rater_pairs:

        # call the PRMSE function for this pair
        prmse_for_this_pair = prmse_true(df_scores[system_id],
                                         df_scores[[rater_id1, rater_id2]])

        # save the returned PRMSE value
        prmse_for_all_pairs.append(prmse_for_this_pair)

    return prmse_for_all_pairs
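

# Illustrative usage sketch (not part of the original module): assumes the same
# kind of synthetic data frame as above, with a "sys_1" system column and
# rater columns "h_1" through "h_4"; all of these names are assumptions.
def _example_prmse_multiple_pairs(df_scores):
    """Compute PRMSE for one hypothetical system over two assumed rater pairs."""
    rater_pairs = [['h_1', 'h_2'], ['h_3', 'h_4']]
    # one PRMSE value per rater pair, in the same order as ``rater_pairs``
    return compute_prmse_one_system_multiple_rater_pairs(df_scores, 'sys_1', rater_pairs)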


def compute_cumulative_mean_for_raters(df_scores, rater_ids):
    """
    Compute the cumulative average of the given raters' scores.

    This function computes the cumulative average of the scores
    assigned by the raters identified by the given list of
    ``rater_ids``. The cumulative average is computed by
    adding one rater at a time.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores.
        This is usually one of the data frames returned
        by the ``simulation.dataset.Dataset.to_frame()``
        method.
    rater_ids : numpy.ndarray
        An array of simulated rater IDs whose scores we want
        to compute the cumulative average over.

    Returns
    -------
    df_cumulative_mean_scores : pandas.DataFrame
        A data frame containing the cumulative average rater scores.
        It has ``len(df_scores)`` rows and ``len(rater_ids)`` columns.
        Each of the columns is named ``N=n``, where ``n`` is the number of
        raters included in computing the cumulative average.
    """
    # keep only the columns for the given raters
    df_rater_scores = df_scores[rater_ids]

    # compute the cumulative (expanding) mean across the rater columns
    df_cumulative_average_scores = df_rater_scores.expanding(min_periods=1, axis=1).mean()

    # name each column after the number of raters included in its average
    df_cumulative_average_scores.columns = [f"N={num_raters + 1}"
                                            for num_raters in range(len(rater_ids))]
    return df_cumulative_average_scores
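

# Illustrative usage sketch (not part of the original module): builds a tiny
# synthetic score matrix so that the cumulative-average columns ("N=1", "N=2",
# "N=3") can be inspected; the rater IDs and score range are assumptions.
def _example_cumulative_mean():
    """Compute cumulative rater averages over three hypothetical raters."""
    prng = np.random.RandomState(0)
    rater_ids = ['h_1', 'h_2', 'h_3']
    df_scores = pd.DataFrame(prng.randint(1, 7, size=(10, 3)), columns=rater_ids)
    return compute_cumulative_mean_for_raters(df_scores, rater_ids)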


def compute_ranks_from_metrics(df_metrics):
    """
    Compute ranks given metric values for systems.

    This function computes ranks for a list of systems
    according to the different metrics present in the given
    data frame.

    Parameters
    ----------
    df_metrics : pandas.DataFrame
        A data frame with one row for each system to be ranked and
        the following columns:

        1. "system_id" : the ID of the system.
        2. "system_category" : the performance category that the system belongs to.
        3. At least one other column containing values for a metric, with the name
           of the metric being the column name, e.g., "QWK" or "PRMSE".

    Returns
    -------
    df_metric_ranks : pandas.DataFrame
        A data frame with the same number of rows and columns as the input
        data frame except that the values in each metric column are now the
        ranks of the systems rather than the metric values themselves.
    """
    # first we set our indices to be the system IDs and categories so that they are
    # retained when we compute the ranks for the systems
    df_metrics_for_ranks = df_metrics.set_index(['system_category', 'system_id'])

    # if degradation is one of the metrics, multiply it by -1 so that higher
    # values are better, like the other metrics, for ranking purposes
    if "degradation" in df_metrics_for_ranks.columns:
        df_metrics_for_ranks['degradation'] = -1 * df_metrics_for_ranks['degradation']

    # rank in descending order of metric value so that the best system gets rank 1;
    # also reset the indices so that we get the IDs and categories back as columns
    df_metric_ranks = df_metrics_for_ranks.rank(ascending=False).reset_index()

    # return the data frame
    return df_metric_ranks
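

# Illustrative usage sketch (not part of the original module): the metric values
# and category labels below are made up purely to show the expected input layout.
def _example_ranks_from_metrics():
    """Rank three hypothetical systems by made-up QWK and PRMSE values."""
    df_metrics = pd.DataFrame({'system_id': ['sys_1', 'sys_2', 'sys_3'],
                               'system_category': ['low', 'medium', 'high'],
                               'QWK': [0.61, 0.72, 0.80],
                               'PRMSE': [0.70, 0.78, 0.88]})
    # the best system on each metric gets rank 1
    return compute_ranks_from_metrics(df_metrics)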


def simulate_percent_double_scored(df_scores, rater_id1, rater_id2, percentage):
    """
    Simulate a dataset in which only a given percentage of responses is double-scored.

    This function takes two given rater IDs and a percentage value and returns
    a data frame that contains the scores from only those two simulated raters,
    with the additional constraint that only the given percentage of the
    hypothetical responses in the dataset have a score from the second rater;
    the remaining responses have the second rater's score set to ``NaN``.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores.
        This is usually one of the data frames returned
        by the ``simulation.dataset.Dataset.to_frame()``
        method.
    rater_id1 : str
        The ID for the first rater in the rater pair
        being used to evaluate the given system.
        This must be a column in ``df_scores``.
    rater_id2 : str
        The ID for the second rater in the rater pair
        being used to evaluate the given system.
        This must be a column in ``df_scores``.
    percentage : float
        The percentage of randomly chosen responses in the dataset
        that should still have scores from both raters.

    Returns
    -------
    df_rater_pair_scores : pandas.DataFrame
        A data frame containing the same number of rows as ``df_scores`` but
        only two columns: ``rater_id1`` and ``rater_id2``. Only ``percentage``
        percent of responses in the ``rater_id2`` column have valid scores,
        with the rest set to ``NaN``.
    num_double_scored : int
        The number of double-scored responses in the returned data frame.
    """
    # initialize a random number generator
    prng = np.random.RandomState(987654321)

    # create a data frame that only has the scores from the raters in the given pair
    df_rater_pair_scores = df_scores[[rater_id1, rater_id2]].copy()

    # stochastically mask the second rater's scores unless we want 100% double-scored
    if percentage < 100:

        # calculate the number of responses we want to simulate as double-scored
        num_double_scored = int(percentage / 100 * len(df_rater_pair_scores))

        # randomly choose that many responses
        chosen = prng.choice(df_rater_pair_scores.index.values,
                             size=num_double_scored,
                             replace=False)

        # for all the other responses in the dataset, set the second rater's score
        # to null, that is, make them single-scored
        df_rater_pair_scores.loc[~df_rater_pair_scores.index.isin(chosen), rater_id2] = np.nan
    else:
        # if the percentage is 100%, then the whole dataset is double-scored
        # and we don't need to do anything else
        num_double_scored = len(df_rater_pair_scores)

    return df_rater_pair_scores, num_double_scored
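

# Illustrative usage sketch (not part of the original module): masks the second
# hypothetical rater's scores so that only 20% of the responses stay double-scored;
# the rater column names and score range are assumptions.
def _example_simulate_percent_double_scored():
    """Simulate a 20% double-scored version of a tiny synthetic dataset."""
    prng = np.random.RandomState(1)
    df_scores = pd.DataFrame({'h_1': prng.randint(1, 7, size=100),
                              'h_2': prng.randint(1, 7, size=100)})
    df_pair, num_double = simulate_percent_double_scored(df_scores, 'h_1', 'h_2',
                                                         percentage=20)
    # num_double should be 20 here; the other 80 "h_2" scores are set to NaN
    return df_pair, num_double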