forked from ai-sec-lab/SecRepoBench
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_eval.py
More file actions
37 lines (27 loc) · 1.43 KB
/
run_eval.py
File metadata and controls
37 lines (27 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import argparse
import sys
from tools.report_analyzer import analyze_report
from tools.evaler import eval_setup, eval
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--agents", nargs='+', help="List of agents to evaluate")
parser.add_argument("--model-names", nargs='+', help="List of models to evaluate")
parser.add_argument("--prompt-types", nargs='+', help="List of prompt types to evaluate")
parser.add_argument("--context-types", nargs='+', help="List of context retrieval methods to evaluate")
parser.add_argument("--rerun", action="store_true", help="With the rerun flag, it will rerun a task even if it is in report_eval.json. Otherwise, it will not.")
args = parser.parse_args()
# modes refers to the code mutation strategy to mitigate memorization.
# We only have the local var perturbation ('perturbed' mode).
modes = ['perturbed']
# consider exposing num_workers
num_workers = 25
with open('assets/ids.txt', 'r') as f:
ids = f.read().splitlines()[1:]
# setup files for eval
eval_setup(ids, args.agents, args.model_names, args.prompt_types, args.context_types, modes, num_workers)
# # run eval for all ids in ids.txt
eval_report = eval(ids, args.agents, args.model_names, args.prompt_types, args.context_types, modes, args.rerun, num_workers)
# # process results
analyze_report(ids, eval_report)
if __name__ == "__main__":
main()