utils.py
import re
import string
from collections import Counter
from typing import List, Union


def parse_prediction(pred: str) -> str:
    """Truncate everything after the answer at the earliest stop marker, if any."""
    stop_indices = [pred.find(word) for word in ["Question:", "\n\n", "</s>"]]
    stop_indices = [idx for idx in stop_indices if idx != -1]
    if stop_indices:
        pred = pred[:min(stop_indices)].strip()
    return pred
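
# Quick illustration (hypothetical model output that self-continues with a new
# "Question:" block after its answer):
#
#     parse_prediction("Paris\n\nQuestion: What is the capital of Spain?")
#     # -> "Paris"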


######## Evaluation Utils ########

def normalize_answer(s: str) -> str:
    """Lowercase, strip punctuation and articles, and collapse whitespace
    (the standard SQuAD-style answer normalization)."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
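
# For example (illustrative input):
#
#     normalize_answer("The Eiffel Tower!")  # -> "eiffel tower"
#
# (lowercased, "!" stripped, leading article "The" removed, whitespace collapsed)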


def em_score(prediction: str, ground_truths: Union[str, List[str]]) -> int:
    """Exact match: 1 if the normalized prediction equals any normalized ground truth."""
    if isinstance(ground_truths, str):
        ground_truths = [ground_truths]
    return max(int(normalize_answer(prediction) == normalize_answer(gt)) for gt in ground_truths)
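
# E.g. em_score("Paris", ["paris", "Lyon"]) -> 1 (normalization lowercases),
# while em_score("The answer is Paris", ["Paris"]) -> 0 (extra tokens break EM).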


def f1_score(prediction: str, ground_truths: Union[str, List[str]]):
    """Token-level F1, precision, and recall, plus a substring-containment
    accuracy, each taken as the max over all ground truths."""
    if isinstance(ground_truths, str):
        ground_truths = [ground_truths]
    final_metric = {'f1': 0.0, 'precision': 0.0, 'recall': 0.0, 'acc': 0.0}
    normalized_prediction = normalize_answer(prediction)
    for ground_truth in ground_truths:
        normalized_ground_truth = normalize_answer(ground_truth)
        # For yes/no/noanswer questions, token overlap is meaningless:
        # only an exact match counts, so skip mismatched pairs entirely.
        if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
            continue
        if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
            continue
        prediction_tokens = normalized_prediction.split()
        ground_truth_tokens = normalized_ground_truth.split()
        common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            continue
        precision = num_same / len(prediction_tokens)
        recall = num_same / len(ground_truth_tokens)
        scores = {
            'f1': (2 * precision * recall) / (precision + recall),
            'precision': precision,
            'recall': recall,
            'acc': 1.0 if normalized_ground_truth in normalized_prediction else 0.0,
        }
        for k, v in scores.items():
            final_metric[k] = max(v, final_metric[k])
    return final_metric['f1'], final_metric['precision'], final_metric['recall'], final_metric['acc']
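

# Minimal usage sketch (made-up strings, not from the original pipeline):
if __name__ == "__main__":
    raw = "Albert Einstein\n\nQuestion: Who developed special relativity?"
    pred = parse_prediction(raw)                # -> "Albert Einstein"
    print(em_score(pred, ["Albert Einstein"]))  # exact match: 1
    f1, precision, recall, acc = f1_score(pred, ["Einstein"])
    print(f1, precision, recall, acc)           # partial overlap: f1 ~= 0.67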