"""
Copyright (C) 2014 Dallas Card
Copyright (C) 2018 Vaibhav B Sinha, Sukrut Rao, Vineeth N Balasubramanian
Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
"""
from __future__ import print_function

import sys

import numpy as np


def main(args, data, gold=None):
    """
    Run the EM estimator on the given data
    Args:
        args: arguments; must have an 'algorithm' attribute whose value is
            one of ['FDS', 'DS', 'H', 'MV'], and a boolean 'verbose' attribute
        data: a dictionary of crowd-sourced responses:
            {questions: {participants: [labels]}}
        gold: the correct label for each question: [nQuestions]
    Returns:
        result: the estimated label for each question: [nQuestions]
        acc: accuracy of the estimated labels if gold was specified, else None
    """
assert args.algorithm in ['FDS', 'DS', 'H', 'MV'], 'Invalid algorithm'
result = run(data, args=args)
if gold is not None:
acc = (gold == result).mean()
else:
acc = None
return result, acc


def run(responses, args, tol=0.0001, CM_tol=0.005, max_iter=100):
    """
    Run the aggregator on response data
    Args:
        responses: a dictionary of responses:
            {questions: {participants: [labels]}}
        args: must have an 'algorithm' attribute whose value is one of
            ['FDS', 'DS', 'H', 'MV'], and a boolean 'verbose' attribute
            'FDS': Fast Dawid-Skene algorithm
            'DS': original Dawid-Skene algorithm
            'H': Hybrid algorithm
            'MV': majority voting
        tol: convergence threshold on the change in class marginals
        CM_tol: threshold on the change in class marginals for switching to
            'hard' mode in the Hybrid algorithm; has no effect for FDS or DS
        max_iter: maximum number of EM iterations
    Returns:
        The estimated label for each question: [nQuestions]
    """
mode = args.algorithm
# convert responses to counts
(questions, participants, classes, counts) = responses_to_counts(responses)
if args.verbose:
print("Number of Questions:", len(questions))
print("Number of Participants:", len(participants))
print("Classes:", classes)
question_classes = initialize(counts, mode)
if mode == 'MV':
return np.argmax(question_classes, axis=1)
    # initialize
    nIter = 0
    converged = False
    old_class_marginals = None
    old_error_rates = None
    if args.verbose:
        print("Iter\tlog-likelihood\tdelta-CM\tdelta-ER")
    while not converged:
        nIter += 1
        # M-step
        (class_marginals, error_rates) = m_step(counts, question_classes)
        # E-step
        question_classes = e_step(counts, class_marginals, error_rates, mode)
        # check likelihood
        log_L = calc_likelihood(counts, class_marginals, error_rates,
                                verbose=args.verbose)
# check for convergence
if old_class_marginals is not None:
class_marginals_diff = np.sum(
np.abs(class_marginals - old_class_marginals))
error_rates_diff = np.sum(np.abs(error_rates - old_error_rates))
if args.verbose:
print(nIter, '\t', log_L, '\t%.6f\t%.6f' %
(class_marginals_diff, error_rates_diff))
if (class_marginals_diff < tol) or nIter >= max_iter:
converged = True
elif (mode == 'H' and class_marginals_diff <= CM_tol):
if args.verbose:
print("Mode changed to Hphase2")
mode = 'Hphase2'
else:
if args.verbose:
print(nIter, '\t', log_L)
old_class_marginals = class_marginals
old_error_rates = error_rates
    if args.verbose:
        np.set_printoptions(precision=2, suppress=True)
        print("Class marginals")
        print(class_marginals)
result = np.argmax(question_classes, axis=1)
return result


def responses_to_counts(responses):
"""
Convert a matrix of annotations to count data
Args:
responses: dictionary of responses {questions:{participants:[responses]}}
Returns:
questions: list of questions
participants: list of participants
classes: list of possible classes (choices)
counts: 3d array of counts: [questions x participants x classes]
"""
    questions = sorted(responses.keys())
    nQuestions = len(questions)
    # determine the participants and classes
    participants = set()
    classes = set()
    for i in questions:
        participants.update(responses[i].keys())
        for k in responses[i]:
            classes.update(responses[i][k])
    classes = sorted(classes)
    nClasses = len(classes)
    participants = sorted(participants)
    nParticipants = len(participants)
# create a 3d array to hold counts
counts = np.zeros([nQuestions, nParticipants, nClasses])
# convert responses to counts
    for i, question in enumerate(questions):
        for participant in responses[question]:
            k = participants.index(participant)
            for response in responses[question][participant]:
                j = classes.index(response)
                counts[i, k, j] += 1
return (questions, participants, classes, counts)
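
# Illustrative (hypothetical) example of responses_to_counts:
#   responses = {1: {'a': [0], 'b': [1]}, 2: {'a': [1], 'b': [1]}}
# gives questions = [1, 2], participants = ['a', 'b'], classes = [0, 1],
# and counts of shape [2 questions x 2 participants x 2 classes], with
# counts[0, 1, 1] == 1 because participant 'b' answered 1 on question 1.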


def initialize(counts, mode):
    """
    Get initial estimates of the true classes from the response counts
    Args:
        counts: counts of the number of times each response was received
            by each question from each participant: [questions x participants x classes]
        mode: one of ['FDS', 'DS', 'H', 'MV']
            'FDS', 'MV' and 'H' give a majority-voting initialization
            'DS' gives the initialization described in Dawid and Skene (1979)
    Returns:
        question_classes: matrix of estimates of true classes:
            [questions x classes]
    """
[nQuestions, nParticipants, nClasses] = np.shape(counts)
response_sums = np.sum(counts, 1)
question_classes = np.zeros([nQuestions, nClasses])
if mode == 'FDS' or mode == 'MV':
for p in range(nQuestions):
indices = np.argwhere(response_sums[p, :] == np.max(
response_sums[p, :])).flatten()
question_classes[p, np.random.choice(indices)] = 1
else:
for p in range(nQuestions):
            question_classes[p, :] = (response_sums[p, :] /
                                      np.sum(response_sums[p, :], dtype=float))
return question_classes
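
# Illustrative: given responses [0, 0, 1] to a question, 'DS' initializes its
# row of question_classes as [2/3, 1/3], while 'FDS'/'MV'/'H' assign the
# one-hot majority vote [1, 0] (ties broken uniformly at random).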


def m_step(counts, question_classes):
"""
M Step for the EM algorithm
Get estimates for the prior class probabilities (p_j) and the error
rates (pi_jkl) using MLE with current estimates of true question classes
See equations 2.3 and 2.4 in Dawid-Skene (1979) or equations 3 and 4 in
our paper (Fast Dawid-Skene: A Fast Vote Aggregation Scheme for Sentiment
Classification)
Args:
counts: Array of how many times each response was received
by each question from each participant: [questions x participants x classes]
question_classes: Matrix of current assignments of questions to classes
Returns:
p_j: class marginals - the probability that the correct answer of a question
is a given choice (class) [classes]
pi_kjl: error rates - the probability of participant k labeling
response l for a question whose correct answer is j [participants, classes, classes]
"""
[nQuestions, nParticipants, nClasses] = np.shape(counts)
# compute class marginals
class_marginals = np.sum(question_classes, 0) / float(nQuestions)
# compute error rates
error_rates = np.zeros([nParticipants, nClasses, nClasses])
for k in range(nParticipants):
for j in range(nClasses):
for l in range(nClasses):
error_rates[k, j, l] = np.dot(
question_classes[:, j], counts[:, k, l])
sum_over_responses = np.sum(error_rates[k, j, :])
if sum_over_responses > 0:
error_rates[k, j, :] = error_rates[
k, j, :] / float(sum_over_responses)
return (class_marginals, error_rates)
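
# A sketch of the M-step updates implemented above, with T[i, j] denoting the
# current assignment of question i to class j (notation assumed here):
#   p_j = sum_i T[i, j] / nQuestions
#   pi[k, j, l] proportional to sum_i T[i, j] * counts[i, k, l],
#   normalized over l for each (k, j)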


def e_step(counts, class_marginals, error_rates, mode):
    """
    E (+ C) step for the EM algorithm
    Determine the probability of each question belonging to each class,
    given current ML estimates of the parameters from the M-step. For FDS
    and Hphase2, also perform the C step (see Section 3.4 of our paper).
    See equation 2.5 in Dawid-Skene (1979) or equations 1 and 2 in
    our paper (Fast Dawid-Skene: A Fast Vote Aggregation Scheme for Sentiment
    Classification)
Args:
counts: Array of how many times each response was received
by each question from each participant: [questions x participants x classes]
class_marginals: probability of a random question belonging to each class: [classes]
error_rates: probability of participant k assigning a question whose correct
label is j the label l: [participants x classes x classes]
mode: One among ['H', 'Hphase2', 'FDS', 'DS']
'Hphase2' and 'FDS' will perform E + C step
'DS' and 'H' will perform only the E step
'FDS': use for FDS algorithm
'DS': use for original DS algorithm
'H' and 'Hphase2': use for Hybrid algorithm
Returns:
question_classes: Assignments of labels to questions
[questions x classes]
"""
[nQuestions, nParticipants, nClasses] = np.shape(counts)
question_classes = np.zeros([nQuestions, nClasses])
final_classes = np.zeros([nQuestions, nClasses])
for i in range(nQuestions):
for j in range(nClasses):
estimate = class_marginals[j]
            estimate *= np.prod(
                np.power(error_rates[:, j, :], counts[i, :, :]))
            question_classes[i, j] = estimate
if mode == 'H' or mode == 'DS':
question_sum = np.sum(question_classes[i, :])
if question_sum > 0:
question_classes[i, :] = question_classes[
i, :] / float(question_sum)
else:
indices = np.argwhere(question_classes[i, :] == np.max(
question_classes[i, :])).flatten()
final_classes[i, np.random.choice(indices)] = 1
if mode == 'H' or mode == 'DS':
return question_classes
else:
return final_classes
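
# A sketch of the E-step update implemented above (same assumed notation as
# the M-step sketch): for 'DS'/'H' each row is the normalized posterior
#   T[i, j] proportional to p_j * prod_{k, l} pi[k, j, l] ** counts[i, k, l],
# while 'FDS'/'Hphase2' additionally apply the C step, collapsing each row to
# a one-hot vector at its (randomly tie-broken) argmax.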


def calc_likelihood(counts, class_marginals, error_rates, verbose=False):
    """
    Calculate the likelihood given the current parameter estimates
    This should go up monotonically as EM proceeds
    See equation 2.7 in Dawid-Skene (1979)
    Args:
        counts: Array of how many times each response was received
            by each question from each participant: [questions x participants x classes]
        class_marginals: probability of a random question belonging to each class: [classes]
        error_rates: probability of participant k assigning a question whose correct
            label is j the label l: [participants x classes x classes]
        verbose: whether to print diagnostics if the likelihood becomes NaN/inf
    Returns:
        Log-likelihood given current parameter estimates
    """
    [nQuestions, nParticipants, nClasses] = np.shape(counts)
    log_L = 0.0
    for i in range(nQuestions):
        question_likelihood = 0.0
        for j in range(nClasses):
            class_prior = class_marginals[j]
            question_class_likelihood = np.prod(
                np.power(error_rates[:, j, :], counts[i, :, :]))
            question_class_posterior = class_prior * question_class_likelihood
            question_likelihood += question_class_posterior
        temp = log_L + np.log(question_likelihood)
        if np.isnan(temp) or np.isinf(temp):
            if verbose:
                print(i, log_L, np.log(question_likelihood), temp)
            sys.exit()
        log_L = temp
    return log_L


if __name__ == '__main__':
    print("Aggregation Algorithms")