evaluate.py
# -*- coding: utf-8 -*-
"""evaluate.py

Evaluation routines for LegalLLM: similar-case retrieval, precedent
recommendation, and legal judgment prediction.

Originally generated by Colab; the source notebook is located at
https://colab.research.google.com/drive/1eV4-E7VDAHekOoakymatKd3grGwUSTAC
"""
import json

import numpy as np
from sklearn.metrics import precision_recall_fscore_support
def evaluate_similar_case_retrieval(model, test_cases):
    """Average precision, recall, and F1 of retrieved cases over all test cases."""
    results = {'precision': [], 'recall': [], 'f1': []}
    for case in test_cases:
        # `retrieved` is expected to be binary relevance labels aligned with `case.relevant`.
        retrieved = model.retrieve_similar_cases(case)
        precision, recall, f1, _ = precision_recall_fscore_support(
            case.relevant,
            retrieved,
            average='binary'
        )
        results['precision'].append(precision)
        results['recall'].append(recall)
        results['f1'].append(f1)
    # Cast to plain floats so the averaged metrics stay JSON-serializable.
    return {k: float(np.mean(v)) for k, v in results.items()}
def evaluate_precedent_recommendation(model, test_cases):
    """Exact-match accuracy of the recommended precedent."""
    correct = 0
    total = len(test_cases)
    for case in test_cases:
        recommendation = model.recommend_precedent(case)
        if recommendation == case.correct_precedent:
            correct += 1
    return {'accuracy': correct / total}
def evaluate_legal_judgment(model, test_cases):
    """Accuracy of predicted judgments against the recorded judgments."""
    predictions = []
    actuals = []
    for case in test_cases:
        pred = model.predict_judgment(case)
        predictions.append(pred)
        actuals.append(case.actual_judgment)
    return {
        'judgment_accuracy': sum(p == a for p, a in zip(predictions, actuals)) / len(predictions)
    }
def run_full_evaluation(model, test_cases):
    """Run all three evaluations on the given model and test cases, print and return the results."""
    results = {
        'retrieval_metrics': evaluate_similar_case_retrieval(model, test_cases),
        'precedent_metrics': evaluate_precedent_recommendation(model, test_cases),
        'judgment_metrics': evaluate_legal_judgment(model, test_cases),
    }
    print("Evaluation Results:")
    print(json.dumps(results, indent=2))
    return results
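

# A minimal usage sketch, not part of the original notebook export: the
# evaluation functions only assume a model object exposing
# retrieve_similar_cases, recommend_precedent, and predict_judgment, and
# test cases carrying `relevant`, `correct_precedent`, and
# `actual_judgment` attributes. The stub model and cases below are
# hypothetical placeholders for illustration only.
if __name__ == "__main__":
    from types import SimpleNamespace

    class _StubModel:
        def retrieve_similar_cases(self, case):
            # Pretend retrieval is perfect: echo the gold relevance labels.
            return case.relevant

        def recommend_precedent(self, case):
            return case.correct_precedent

        def predict_judgment(self, case):
            return case.actual_judgment

    stub_cases = [
        SimpleNamespace(relevant=[1, 0, 1], correct_precedent="P-1", actual_judgment="granted"),
        SimpleNamespace(relevant=[0, 1, 1], correct_precedent="P-2", actual_judgment="denied"),
    ]
    run_full_evaluation(_StubModel(), stub_cases)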