-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathevaluate_perfomance_casehold.py
54 lines (46 loc) · 2.02 KB
/
evaluate_perfomance_casehold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import json
import random
import numpy as np
import os
from sklearn.metrics import classification_report
from datasets import load_dataset
from data import DATA_DIR
import argparse
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model, instantiated once when the module is loaded.
# main() uses it as a fallback: when a free-text prediction contains none of
# the answer choices verbatim, the prediction and the choices are embedded and
# the choice with the highest cosine similarity is taken as the prediction.
model = SentenceTransformer('all-MiniLM-L6-v2')
def _match_choice(text, choices):
    """Return the index of the first choice contained in *text*, or None.

    The comparison is a case-insensitive substring test, checked in the order
    the choices are listed.
    """
    lowered = text.lower()
    for choice_idx, choice in enumerate(choices):
        if choice.lower() in lowered:
            return choice_idx
    return None


def _closest_choice(text, choices):
    """Return the index of the choice most similar to *text* by embedding.

    Both *text* and *choices* are encoded with the module-level ``model`` and
    compared with cosine similarity. The result is converted to a plain
    Python ``int`` so it can be mixed with the substring-matched indices.
    """
    text_embedding = model.encode(text)
    choice_embeddings = model.encode(choices)
    return int(util.cos_sim(text_embedding, choice_embeddings).argmax().item())


def main(args, num_choices=5):
    """Score zero-shot CaseHOLD predictions and print a classification report.

    Reads ``case_hold_{args.model_name}_predictions.jsonl`` from the
    ``zero-shot-predictions`` folder under ``DATA_DIR``. Each JSON line is
    expected to carry the keys ``'choices'`` (list of strings), ``'answer'``
    (string) and ``'prediction'`` (string or ``None``).

    Args:
        args: Namespace with a ``model_name`` attribute that selects the
            predictions file.
        num_choices: Number of answer choices per question; determines the
            rows of the report. Defaults to 5.

    Side effects:
        Prints progress messages, summary counts and the report to stdout.
    """
    predictions_path = os.path.join(
        DATA_DIR, 'zero-shot-predictions', f'case_hold_{args.model_name}_predictions.jsonl')
    with open(predictions_path, encoding='utf-8') as file:
        # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
        dataset = [json.loads(line) for line in file if line.strip()]

    labels = []
    predictions = []
    nones = 0
    noisy_labels = 0
    unmatched_answers = 0
    for example in dataset:
        if example['prediction'] is None:
            nones += 1
            continue

        choices = example['choices']
        label_id = _match_choice(example['answer'], choices)
        if label_id is None:
            # Without a gold label the example cannot be scored. Skipping it
            # here keeps `labels` and `predictions` aligned one-to-one.
            unmatched_answers += 1
            print(f'- Answer "{example["answer"].lower()}" matches none of the choices; example skipped')
            continue

        pred_id = _match_choice(example['prediction'], choices)
        if pred_id is None:
            # No choice appears verbatim in the prediction: fall back to the
            # semantically closest choice.
            prediction = example['prediction'].lower()
            pred_id = _closest_choice(prediction, choices)
            print(f'- Prediction "{prediction}" best matches label "{choices[pred_id].lower()}"')
            noisy_labels += 1

        labels.append(label_id)
        predictions.append(pred_id)

    print(f'{nones} question unanswered!\n')
    print(f'{noisy_labels} noisy predictions resolved by embedding similarity!\n')
    if unmatched_answers:
        print(f'{unmatched_answers} examples skipped because the answer matched no choice!\n')

    if not labels:
        print('No scorable examples found; skipping classification report.')
        return

    # Passing `labels` explicitly keeps the report consistent with
    # `target_names` even when some choice index never occurs in the data.
    print(classification_report(y_true=labels, y_pred=predictions,
                                labels=list(range(num_choices)),
                                target_names=[f'Choice {idx}' for idx in range(num_choices)],
                                zero_division=0, digits=3))
if __name__ == '__main__':
    # Parse the command line and run the evaluation only when this file is
    # executed as a script, so that importing it does not read sys.argv or
    # start an evaluation run.
    # NOTE(review): the description 'Prompting GPT' looks copied from the
    # prompting script; this script evaluates stored predictions.
    parser = argparse.ArgumentParser(description='Prompting GPT')
    # --model_name selects which predictions file main() reads.
    parser.add_argument("--model_name", type=str, default='gpt-3.5-turbo', help="GPT model name")
    args = parser.parse_args()
    main(args)