-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathutils_qa.py
457 lines (357 loc) · 19.6 KB
/
utils_qa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# VumBleBot/odqa_baseline_code
# Baseline code for Korean open domain question answering(ODQA)
# Modifications copyright (C) 2021 VumBleBot
#
"""
Post-processing utilities for question answering.
"""
import os
import json
import collections
from typing import Optional, Tuple
import numpy as np
import torch
from tqdm.auto import tqdm
from konlpy.tag import Mecab
from reader.pororo_reader import PororoMrcFactory
# Module-level MeCab tagger; remove_last_postposition() uses it for POS tagging.
mecab = Mecab()
# TODO: move alpha into the config. It is the weight applied to the pororo score in pororo_voting().
alpha = 2.0
def get_all_logits(predictions, features):
    """
    Validate the shape of `predictions` against `features` and split it into its two parts.

    :param predictions: pair of (start_logits, end_logits)
    :param features: tokenized and divided contexts (the processed dataset, cut by max_sequence_length)
    :return: (all_start_logits, all_end_logits) when both checks pass.
    :raises AssertionError: if `predictions` does not hold exactly two elements, or if the number
        of start-logit rows differs from the number of features.
    """
    # Both sanity checks run first; the split below is only reached for a well-formed pair.
    assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
    assert len(predictions[0]) == len(features), f"Got {len(predictions[0])} predictions and {len(features)} features."

    start_part, end_part = predictions
    return start_part, end_part
def map_examples_to_features(examples, features, topk):
    """
    Group feature indices by the example row they were cut from.

    A "document" is an original context; a "feature" is one tokenized window of a document.
    :ex
        Document 1 = [ Feature1-1, Feature1-2, Feature1-3, ... ]
        Document 2 = [ Feature2-1, Feature2-2 ]

    Rows of `examples` that share an id are told apart by the suffix `row_index % topk`.
    Features are walked in order, and a new document is assumed to start whenever the first
    context offset of a feature does not increase compared with the previous feature.

    :param examples: QA Dataset; `examples["id"]` must yield one string id per row
    :param features: tokenized and divided contexts (the processed dataset, cut by max_sequence_length)
    :param topk: number of documents bound to one question
    :return: defaultdict(list) mapping example row index (NOT the example_id string, such as
        mrc-00-1234) to the indices of the features that belong to that row.
    """
    # e.g. {'mrc-0-00XXXX_0': 0, 'mrc-0-00XXXX_1': 1, ...}: every row gets its own key.
    example_id_to_index = {
        "_".join([example_id, str(row % topk)]): row for row, example_id in enumerate(examples["id"])
    }
    features_per_example = collections.defaultdict(list)

    last_offset = -1
    doc_slot = 0
    for feature_index, feature in enumerate(features):
        # Skip the leading None entries (the question part of the sequence, per the original
        # comments) to reach the first context token.
        cursor = 0
        while feature["offset_mapping"][cursor] is None:
            cursor += 1
        first_offset = feature["offset_mapping"][cursor][0]

        # A start offset that drops or stays equal means the next document has begun;
        # the slot cycles through 0 .. topk-1.
        if first_offset <= last_offset:
            doc_slot = (doc_slot + 1) % topk

        # Key such as 'mrc-00-00XXXX_0'; register this feature under the matching row.
        slot_key = "_".join([feature["example_id"], str(doc_slot)])
        features_per_example[example_id_to_index[slot_key]].append(feature_index)
        last_offset = first_offset

    return features_per_example
def looping_through_all_features(
    all_start_logits, all_end_logits, n_best_size, features, max_answer_length, feature_indices
):
    """
    Collect candidate answer spans from the given features.

    For every feature, the `n_best_size` highest start logits are paired with the
    `n_best_size` highest end logits, and every valid (start, end) pair becomes a candidate.

    :param all_start_logits: start logits, indexable by feature index
    :param all_end_logits: end logits, indexable by feature index
    :param n_best_size: number of top start/end positions considered per feature
    :param features: tokenized and divided contexts; each needs "offset_mapping" and may carry
        "token_is_max_context"
    :param max_answer_length: maximum number of tokens allowed in a candidate span
    :param feature_indices: indices of the features to inspect
    :return: unsorted list of dict {'offsets': (start_char, end_char), 'score', 'start_logit', 'end_logit'}
    """
    prelim_predictions = []

    for feature_index in feature_indices:
        start_logits = all_start_logits[feature_index]
        end_logits = all_end_logits[feature_index]
        feature = features[feature_index]
        offset_mapping = feature["offset_mapping"]
        token_is_max_context = feature.get("token_is_max_context", None)

        # The null-answer score that used to be tracked here was never returned or read, so it
        # is no longer computed; this also lets an empty logit row yield no candidates instead
        # of raising IndexError.

        # Positions of the n_best_size largest logits, best first.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip positions without a context offset (out of range or mapped to None).
                if (
                    start_index >= len(offset_mapping)
                    or end_index >= len(offset_mapping)
                    or offset_mapping[start_index] is None
                    or offset_mapping[end_index] is None
                ):
                    continue
                # Skip reversed spans and spans longer than max_answer_length tokens.
                if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                    continue
                # When max-context information is present, keep only spans whose start token is
                # flagged in it.
                if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
                    continue
                prelim_predictions.append(
                    {
                        "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
                        "score": start_logits[start_index] + end_logits[end_index],
                        "start_logit": start_logits[start_index],
                        "end_logit": end_logits[end_index],
                    }
                )

    return prelim_predictions
def get_all_prelim_predictions(
    examples, features, features_per_example, all_start_logits, all_end_logits, max_answer_length, topk, n_best_size
):
    """
    Build, for every example row, its best candidate spans in descending score order.

    :param examples: QA Dataset (only its length is used here)
    :param features: tokenized and divided contexts (the processed dataset, cut by max_sequence_length)
    :param features_per_example: mapping of example row index to the indices of its features
    :param all_start_logits: all start logits that the reader model predicts
    :param all_end_logits: all end logits that the reader model predicts
    :param max_answer_length: maximum number of tokens allowed in a candidate span
    :param topk: number of documents bound to one question
    :param n_best_size: the total number of n-best predictions to keep per example row
    :return: one list per visited row, each holding at most n_best_size dicts
        {'offsets': (start, end), 'score', 'start_logit', 'end_logit'} sorted by score.
    """
    ranked_per_example = []

    # Rows come in bundles of `topk`; walk bundle by bundle from each bundle's first row.
    for bundle_head in tqdm(range(0, len(examples), topk)):
        for row in range(bundle_head, bundle_head + topk):
            # Gather candidates from every feature attached to this row.
            candidates = looping_through_all_features(
                all_start_logits,
                all_end_logits,
                n_best_size,
                features,
                max_answer_length,
                features_per_example[row],
            )
            candidates.sort(key=lambda candidate: candidate["score"], reverse=True)
            ranked_per_example.append(candidates[:n_best_size])

    return ranked_per_example
def make_predictions(examples, all_prelim_predictions, topk):
    """
    Turn raw candidate spans into formatted predictions (NOT SORTED across rows).

    The candidate dicts and lists in `all_prelim_predictions` are modified in place: "offsets"
    is replaced by "text", and "probability", "question", "context_id" and "context" are added.

    :param examples: QA Dataset; rows need "context", "question" and "context_id" or "document_id"
    :param all_prelim_predictions: per-row candidate lists sorted by score
    :param topk: number of documents bound to one question
    :return: list with one prediction list per example row (topk * number of questions)
    """
    formatted = []

    for bundle_head in tqdm(range(0, len(examples), topk)):
        for row in range(bundle_head, bundle_head + topk):
            sample = examples[row]
            candidates = all_prelim_predictions[row]

            # 01 Map every candidate's character offsets to its answer text.
            passage = sample["context"]
            for candidate in candidates:
                span = candidate.pop("offsets")
                candidate["text"] = passage[span[0] : span[1]]

            # 02 Create a fake answer when there is no usable one.
            if not candidates or (len(candidates) == 1 and candidates[0]["text"] == ""):
                candidates.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})

            # 03 Softmax over the scores, shifted by the maximum before exponentiating.
            raw_scores = np.array([candidate["score"] for candidate in candidates])
            shifted = np.exp(raw_scores - np.max(raw_scores))
            probabilities = shifted / shifted.sum()

            # Metadata for analysing predictions. Per the original notes, run_mrc does no
            # retrieval, so only document_id (the gold document id) exists there, while run
            # passes the retrieved source document as context_id.
            if "context_id" in sample.keys():
                source_id = sample["context_id"]
            else:
                source_id = sample["document_id"]
            question = sample["question"]

            for probability, candidate in zip(probabilities, candidates):
                candidate["probability"] = probability
                candidate["question"] = question
                candidate["context_id"] = source_id
                candidate["context"] = passage

            formatted.append(candidates)

    return formatted
def select_top_score_predict(examples, all_predictions, n_best_size, topk):
    """
    Merge the predictions of each topk bundle and keep the highest scoring ones.

    :param examples: QA Dataset; the first row of each bundle supplies the "id"
    :param all_predictions: one prediction list per example row
    :param n_best_size: number of merged predictions kept per question
    :param topk: number of documents bound to one question
    :return: (final_predictions, all_nbest_json), both OrderedDicts keyed by example id;
        the first holds the best answer text, the second the kept predictions with NumPy
        floats converted to Python floats.
    """
    best_text_by_id = collections.OrderedDict()
    nbest_by_id = collections.OrderedDict()
    numpy_float_types = (np.float16, np.float32, np.float64)

    for bundle_head in tqdm(range(0, len(examples), topk)):
        example_id = examples[bundle_head]["id"]

        # Pool the predictions of all rows in this bundle, then rank them together.
        pooled = []
        for row in range(bundle_head, bundle_head + topk):
            pooled.extend(all_predictions[row])
        pooled.sort(key=lambda prediction: prediction["score"], reverse=True)
        kept = pooled[:n_best_size]

        # Log the answer.
        best_text_by_id[example_id] = kept[0]["text"]

        # Log the candidates that could have contained the answer.
        serializable = []
        for prediction in kept:
            entry = {}
            for field, value in prediction.items():
                entry[field] = float(value) if isinstance(value, numpy_float_types) else value
            serializable.append(entry)
        nbest_by_id[example_id] = serializable

    return best_text_by_id, nbest_by_id
def save_predictions(predictions, output_dir, filename):
    """
    Write `predictions` as indented JSON to `output_dir/filename`.

    :param predictions: JSON-serializable object to store
    :param output_dir: existing directory to write into
    :param filename: name of the file created (or overwritten) inside `output_dir`
    :raises AssertionError: if `output_dir` is not a directory.
    """
    assert os.path.isdir(output_dir), f"{output_dir} is not a directory."

    save_path = os.path.join(output_dir, filename)
    # ensure_ascii=False emits raw non-ASCII text (e.g. Korean answers), so the encoding is
    # pinned to UTF-8 instead of depending on the platform default.
    with open(save_path, "w", encoding="utf-8") as writer:
        writer.write(json.dumps(predictions, indent=4, ensure_ascii=False) + "\n")
def load_predictions(data_dir, filename):
    """
    Load a JSON prediction file from `data_dir/filename`.

    :param data_dir: existing directory that holds the file
    :param filename: name of the JSON file inside `data_dir`
    :return: the decoded JSON content
    :raises AssertionError: if `data_dir` is not a directory.
    """
    assert os.path.isdir(data_dir), f"{data_dir} is not a directory."

    load_path = os.path.join(data_dir, filename)
    # Prediction files contain raw non-ASCII text, so decode as UTF-8 explicitly instead of
    # depending on the platform default encoding.
    with open(load_path, "r", encoding="utf-8") as prediction_file:
        predictions = json.load(prediction_file)

    return predictions
def remove_last_postposition(predict):
    """
    Remove a postposition (particle) from the end of a prediction, if there is one.

    The text is POS-tagged with the module-level `mecab` tagger. When the tag of the last
    morpheme starts with "J", only the last occurrence of that morpheme's surface form is cut
    out; identical characters earlier in the answer are kept.

    :param predict: predicted answer text
    :return: the text without its final postposition, or unchanged when there is none.
    """
    pos_tagged_predict = mecab.pos(predict)

    # Nothing to strip when the tagger yields no morphemes (e.g. empty or whitespace-only text).
    if not pos_tagged_predict:
        return predict

    last_surface = pos_tagged_predict[-1][0]
    last_tag = pos_tagged_predict[-1][-1]

    # Remove the postposition.
    if last_tag.startswith("J"):
        # rfind targets the final occurrence only; str.replace would delete every occurrence
        # of the same characters throughout the answer.
        cut = predict.rfind(last_surface)
        if cut != -1:
            predict = predict[:cut] + predict[cut + len(last_surface) :]

    return predict
def pororo_predict(examples, mrc_model, topk):
    """
    Run the pororo reader on every example row and keep the best answer of each topk bundle.

    :param examples: QA Dataset; rows need "question" and "context"
    :param mrc_model: callable invoked as mrc_model(question, context, postprocess=False); the
        first item of its result is unpacked as (text, _, score)
    :param topk: number of documents bound to one question
    :return: list with one dict {"text", "score"} per question.
    """
    best_per_question = []

    for bundle_head in tqdm(range(0, len(examples), topk)):
        bundle_answers = []
        for row in range(bundle_head, bundle_head + topk):
            sample = examples[row]
            answer_text, _, answer_score = mrc_model(sample["question"], sample["context"], postprocess=False)[0]
            answer_text = remove_last_postposition(answer_text)
            bundle_answers.append({"text": answer_text, "score": answer_score})

        # Keep only the top-1 among the top-1 answers of each context.
        best_per_question.append(max(bundle_answers, key=lambda answer: answer["score"]))

    return best_per_question
def pororo_voting(examples, all_pororo_preds, output_dir, prefix, topk):
    """
    Boost stored n-best predictions that agree with the pororo answer and re-rank them.

    Reads the n-best file from `output_dir`. A prediction whose text equals the pororo answer
    for that question gets `pororo score * alpha` added to its score and is flagged with
    "pororo_voting": True.

    :param examples: QA Dataset; row `i * topk` supplies the id of the i-th question
    :param all_pororo_preds: one dict {"text", "score"} per question
    :param output_dir: directory that holds the n-best prediction file
    :param prefix: optional prefix that is part of the n-best file name
    :param topk: number of documents bound to one question
    :return: (voted predictions, voted n-best), both OrderedDicts keyed by example id.
    """
    voted_text_by_id = collections.OrderedDict()
    voted_nbest_by_id = collections.OrderedDict()
    numpy_float_types = (np.float16, np.float32, np.float64)

    if prefix:
        filename = f"nbest_predictions_{prefix}.json"
    else:
        filename = "nbest_predictions.json"
    nbest_lists = list(load_predictions(output_dir, filename).values())  # len(dataset)

    for position, candidates in enumerate(nbest_lists):
        example_id = examples[position * topk]["id"]

        for candidate in candidates:
            if candidate["text"] != all_pororo_preds[position]["text"]:
                continue
            candidate["score"] += all_pororo_preds[position]["score"] * alpha
            candidate["pororo_voting"] = True

        ranked = sorted(candidates, key=lambda candidate: candidate["score"], reverse=True)

        voted_text_by_id[example_id] = ranked[0]["text"]
        serializable = []
        for candidate in ranked:
            entry = {}
            for field, value in candidate.items():
                entry[field] = float(value) if isinstance(value, numpy_float_types) else value
            serializable.append(entry)
        voted_nbest_by_id[example_id] = serializable

    return voted_text_by_id, voted_nbest_by_id
def pororo_ensemble(examples, output_dir, prefix, topk):
    """
    Predict with the pororo reader and merge its votes into the stored n-best predictions.

    :param examples: QA Dataset
    :param output_dir: directory that holds the n-best prediction file
    :param prefix: optional prefix that is part of the n-best file name
    :param topk: number of documents bound to one question
    :return: (voted predictions, voted n-best) as produced by pororo_voting.
    """
    # PORORO reader used for voting; it is placed on the first CUDA device when one is available.
    reader_factory = PororoMrcFactory("mrc", "ko", "brainbert.base.ko.korquad")
    if torch.cuda.is_available():
        device_name = "cuda:0"
    else:
        device_name = "cpu"
    pororo_reader = reader_factory.load(torch.device(device_name))

    # Predict with the MRC model.
    pororo_answers = pororo_predict(examples, pororo_reader, topk)

    # Merge the MRC model predictions into the existing results.
    voted_predictions, voted_nbest = pororo_voting(examples, pororo_answers, output_dir, prefix, topk)
    return voted_predictions, voted_nbest
def get_logits_with_offset(
    examples, features, predictions, topk: int = 1, max_answer_length: int = 30, n_best_size: int = 20
):
    """
    Return the per-row candidate spans (offsets plus logits) without mapping them to text.

    :param examples: QA Dataset
    :param features: tokenized and divided contexts (the processed dataset, cut by max_sequence_length)
    :param predictions: pair of (start_logits, end_logits)
    :param topk: number of documents bound to one question
    :param max_answer_length: maximum number of tokens allowed in a candidate span
    :param n_best_size: number of candidates kept per example row
    :return: one score-sorted candidate list per example row.
    """
    start_logits, end_logits = get_all_logits(predictions, features)
    row_to_features = map_examples_to_features(examples, features, topk)

    return get_all_prelim_predictions(
        examples,
        features,
        row_to_features,
        start_logits,
        end_logits,
        max_answer_length,
        topk,
        n_best_size,
    )
def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    training_args,
    topk: int = 1,
    n_best_size: int = 5,
    max_answer_length: int = 30,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
):
    """
    Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
    original contexts. This is the base postprocessing function for models that only return start and end logits.

    Args:
        examples: The non-preprocessed dataset.
        features: The processed dataset.
        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
            first dimension must match the number of elements of :obj:`features`.
        training_args:
            Object whose :obj:`pororo_prediction` attribute enables the pororo voting step when it is :obj:`True`.
        topk (:obj:`int`, `optional`, defaults to 1):
            Number of documents bound to one question.
        n_best_size (:obj:`int`, `optional`, defaults to 5):
            The total number of n-best predictions to generate when looking for an answer.
        max_answer_length (:obj:`int`, `optional`, defaults to 30):
            The maximum length of an answer that can be generated. This is needed because the start and end predictions
            are not conditioned on one another.
        output_dir (:obj:`str`, `optional`):
            If provided, the predictions and n-best predictions (with their scores and logits) are saved in
            `output_dir`. The pororo voting step only runs when this is provided.
        prefix (:obj:`str`, `optional`):
            If provided, the files mentioned above are saved with `prefix` added to their names.

    Returns:
        A dict holding the final answers under "predictions" and, when pororo voting ran, the voted answers under
        "pororo_predictions".
    """
    # Length checks, the example -> features map and the candidate spans all come from the
    # shared helper, which performs those three steps in order.
    all_prelim_predictions = get_logits_with_offset(
        examples,
        features,
        predictions,
        topk=topk,
        max_answer_length=max_answer_length,
        n_best_size=n_best_size,
    )

    all_preds = make_predictions(examples, all_prelim_predictions, topk)
    final_predictions, all_nbest_json = select_top_score_predict(examples, all_preds, n_best_size, topk)

    results = {}
    should_save = output_dir is not None

    if should_save:
        if prefix:
            filename = f"predictions_{prefix}.json"
        else:
            filename = "predictions.json"
        save_predictions(final_predictions, output_dir, filename)
        save_predictions(all_nbest_json, output_dir, "nbest_" + filename)

    results["predictions"] = final_predictions

    if should_save and training_args.pororo_prediction is True:
        voted_predictions, voted_nbest_json = pororo_ensemble(examples, output_dir, prefix, topk)

        if prefix:
            filename = f"pororo_predictions_{prefix}.json"
        else:
            filename = "pororo_predictions.json"
        save_predictions(voted_predictions, output_dir, filename)
        save_predictions(voted_nbest_json, output_dir, "nbest_" + filename)

        results["pororo_predictions"] = voted_predictions

    return results