Commit

Simplify eval code (All-Hands-AI#2775)
* Start simplifying eval code

* Update

* Add EDA

* Updated GAIA

* Update gpqa

* Add humanevalfix

* Fix logic_reasoning

* Add miniwob

* Add mint and ml_bench

* toolqa

* Added swe-bench

* Fixed webarena

* Refactor parameters
neubig authored Jul 5, 2024
1 parent 038e8f8 commit a081935
Showing 16 changed files with 835 additions and 2,495 deletions.
202 changes: 49 additions & 153 deletions evaluation/EDA/run_infer.py
@@ -1,18 +1,21 @@
 import asyncio
-import json
 import logging
 import multiprocessing as mp
 import os
 import pathlib
-import subprocess
-import time
-from concurrent.futures import ProcessPoolExecutor

+import pandas as pd
+
 # import huggingface_hub
 from datasets import load_dataset
-from tqdm import tqdm

 from evaluation.EDA.game import Q20Game, Q20GameCelebrity
+from evaluation.utils.shared import (
+    EvalMetadata,
+    make_metadata,
+    monologue_user_response,
+    prepare_dataset,
+    run_evaluation,
+)
+from opendevin.controller.agent import Agent

 # from evaluation.EDA.scorer import question_scorer
@@ -36,7 +39,7 @@ def cleanup():
         process.join()


-def codeact_user_response(state: State) -> str:
+def codeact_user_response_eda(state: State) -> str:
     global game
     model_guess = ''
     if state.history:
@@ -54,12 +57,8 @@ def codeact_user_response(state: State) -> str:
     return msg


-def monologue_user_response(state: State) -> str:
-    raise NotImplementedError('MonologueAgent should never ask for user responses.')
-
-
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
-    'CodeActAgent': codeact_user_response,
+    'CodeActAgent': codeact_user_response_eda,
     'MonologueAgent': monologue_user_response,
 }
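
Here `monologue_user_response` is now imported from the shared utilities rather than redefined per benchmark. For orientation, a callback in this map receives the agent's State each turn and returns the next simulated user message; a minimal sketch (the State attributes used here are illustrative assumptions, not part of this diff):

def minimal_user_response(state) -> str:
    # Look at the most recent agent output (attribute names are hypothetical).
    last_message = state.history[-1].content if state.history else ''
    if '<solution>' in last_message:
        return '/exit'  # the agent says it is done, so end the session
    return 'Please continue working on the task.'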

@@ -69,10 +68,14 @@ def monologue_user_response(state: State) -> str:


 def process_instance(
-    agent: Agent, instance, metadata, openai_api_key, reset_logger: bool = True
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
 ):
+    # Create the agent
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
     # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-    eval_output_dir = metadata['eval_output_dir']
+    eval_output_dir = metadata.eval_output_dir
     if reset_logger:
         # Set up logger
         log_file = os.path.join(
@@ -107,12 +110,14 @@ def process_instance(

     # Use codeactagent as guesser_model
     global game
-    game = _game_class[metadata['dataset']](
+    assert metadata.dataset is not None
+    assert metadata.details is not None
+    game = _game_class[metadata.dataset](
         item=instance['text'].strip(),
-        answerer_model=metadata['answerer_model'],
+        answerer_model=metadata.details['answerer_model'],
         guesser_model=None,
-        num_turns=metadata['max_iterations'],
-        openai_api_key=openai_api_key,
+        num_turns=metadata.max_iterations,
+        openai_api_key=metadata.details['openai_api_key'],
         guesser_kargs=guesser_kargs,
     )
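
The attribute access plus the optional `dataset` and `details` fields imply that `EvalMetadata` is now a structured object rather than a plain dict. A rough sketch of the shape the call sites in this diff imply (inferred from usage, not the actual definition in evaluation/utils/shared.py):

from dataclasses import dataclass
from typing import Any


@dataclass
class EvalMetadataSketch:
    agent_class: str
    llm_config: Any
    max_iterations: int
    eval_output_dir: str
    data_split: str | None = None
    dataset: str | None = None
    # benchmark-specific extras, e.g. answerer_model and openai_api_key for EDA
    details: dict[str, Any] | None = None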

@@ -195,151 +200,42 @@ def process_instance(
         help='data split, eg, test',
     )
     args, _ = parser.parse_known_args()
     if args.directory:
         config.workspace_base = os.path.abspath(args.directory)
         print(f'Setting workspace base to {config.workspace_base}')
-    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
-    # so we don't need to manage file uploading to OpenDevin's repo
-    eda_dataset = load_dataset(
-        'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
-    )
-    logger.info(
-        f'Evaluating Entity Deduction Arena {args.dataset} {args.data_split} split'
-    )

-    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
-    # for details of how to set `llm_config`
     if args.llm_config:
         specified_llm_config = get_llm_config_arg(args.llm_config)
         if specified_llm_config:
             config.llm = specified_llm_config
     logger.info(f'Config for evaluation: {config}')

-    # TEST METADATA
-    agent_class = args.agent_cls
-    assert (
-        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
-    ), f'Unsupported agent class: {agent_class}'
-    model_name = config.llm.model.split('/')[-1]
-    max_iterations = args.max_iterations
-    eval_note = ''
-    if args.eval_note is not None:
-        eval_note += '_N_' + args.eval_note
-    eval_output_dir = os.path.join(
-        args.eval_output_dir,
-        'eda',
-        agent_class,
-        model_name + '_maxiter_' + str(max_iterations) + eval_note,
-    )
-
-    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
-    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
-        parents=True, exist_ok=True
+    eda_dataset = load_dataset(
+        'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
     )
-    logger.info(f'Using evaluation output directory: {eval_output_dir}')
-
-    metadata = {
-        'dataset': args.dataset,
-        'data_split': args.data_split,
-        'answerer_model': args.answerer_model,
-        'agent_class': agent_class,
-        'model_name': model_name,
-        'max_iterations': max_iterations,
-        'eval_output_dir': eval_output_dir,
-        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
-        # get the commit id of current repo for reproducibility
-        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
-        .decode('utf-8')
-        .strip(),
-    }
-    logger.info(f'Metadata: {metadata}')
-    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
-        json.dump(metadata, f)
-
-    # LIMIT EVALUATION
-    eval_n_limit = args.eval_n_limit
-    if eval_n_limit:
-        eda_dataset = eda_dataset.select(list(range(eval_n_limit)))
-        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
-
-    # OUTPUT FILE
-    output_file = os.path.join(eval_output_dir, 'output.jsonl')
-    logger.info(f'Writing evaluation output to {output_file}')
-    finished_items = set()
-    if os.path.exists(output_file):
-        with open(output_file, 'r') as f:
-            for line in f:
-                data = json.loads(line)
-                finished_items.add(data['instance_id'])
-        logger.warning(
-            f'Output file {output_file} already exists. Loaded {len(finished_items)} finished instances.'
-        )
-    output_fp = open(output_file, 'a')
-
-    logger.info(
-        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    metadata = make_metadata(
+        config.llm,
+        f'eda-{args.dataset}',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+        data_split=args.data_split,
+        details={
+            'answerer_model': str(args.answerer_model),
+            'openai_api_key': str(args.OPENAI_API_KEY),
+        },
     )

-    # =============================================
-    # filter out finished instances
-    new_eda_dataset = []
-    for instance in eda_dataset:
-        if instance['text'].strip() in finished_items:
-            logger.info(
-                f'Skipping instance {instance["text"].strip()} as it is already finished.'
-            )
-            continue
-        new_eda_dataset.append(instance)
-
-    eda_dataset = new_eda_dataset
-    logger.info(
-        f'Finished instances: {len(finished_items)}, Remaining instances: {len(eda_dataset)}'
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    prepared_dataset = prepare_dataset(
+        eda_dataset.to_pandas(), output_file, args.eval_n_limit, 'text'
     )
-    # =============================================
-
-    pbar = tqdm(total=len(eda_dataset))
-    agent = Agent.get_cls(args.agent_cls)(llm=LLM(config.llm))
-
-    # This function tracks the progress AND write the output to a JSONL file
-    def update_progress(future):
-        pbar.update(1)
-        output = future.result()
-        pbar.set_description(f'Instance {output["instance_id"]}')
-        pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
-        logger.info(
-            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]}'
-        )
-        output_fp.write(json.dumps(output) + '\n')
-        output_fp.flush()
-
-    # This sets the multi-processing
-    num_workers = args.eval_num_workers
-    logger.info(f'Using {num_workers} workers for evaluation.')
-
-    # Create the agent
-    agent = Agent.get_cls(agent_class)(llm=LLM(config.llm))
-
-    try:
-        with ProcessPoolExecutor(num_workers) as executor:
-            futures = []
-            # This is how we perform multi-processing
-            for instance in eda_dataset:
-                future = executor.submit(
-                    process_instance,
-                    agent,
-                    instance,
-                    metadata,
-                    args.OPENAI_API_KEY,
-                    reset_logger=bool(num_workers > 1),
-                )
-                future.add_done_callback(update_progress)
-                futures.append(future)
-
-            # Wait for all futures to complete
-            for future in futures:
-                future.result()
-    except KeyboardInterrupt:
-        print('KeyboardInterrupt received. Cleaning up...')
-        cleanup()
-
-    output_fp.close()
-    logger.info('Evaluation finished.')
+    run_evaluation(
+        prepared_dataset,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        'text',
+    )
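
The same four-step shape recurs across the 16 touched benchmarks: build structured metadata, derive the output path, trim the dataset, then fan out over workers. As a sketch of that converged pattern (the benchmark name and id column are placeholders; argument order follows the call sites above):

metadata = make_metadata(
    config.llm,                # LLM config for this run
    'my-benchmark',            # placeholder benchmark name
    args.agent_cls,
    args.max_iterations,
    args.eval_note,
    args.eval_output_dir,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
# prepare_dataset presumably drops rows already present in output.jsonl and
# applies the eval_n_limit cap; 'instance_id' is a placeholder id column.
prepared = prepare_dataset(dataset_df, output_file, args.eval_n_limit, 'instance_id')
run_evaluation(
    prepared, metadata, output_file, args.eval_num_workers,
    process_instance, 'instance_id',
)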
44 changes: 30 additions & 14 deletions evaluation/agent_bench/helper.py
@@ -1,9 +1,39 @@
 import os
 import re
+from functools import partial

+from evaluation.utils.shared import codeact_user_response
 from opendevin.events.action import CmdRunAction, MessageAction
+
+
+def try_parse_answer(act) -> str | None:
+    raw_ans = ''
+    if isinstance(act, MessageAction) and act.source == 'agent':
+        raw_ans = act.content
+    elif isinstance(act, CmdRunAction) and act.source == 'agent':
+        raw_ans = act.thought
+    else:
+        return None
+    agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
+    if not agent_answer:
+        return None
+    return agent_answer[0].strip()
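
The `<solution>` tag convention this parser expects can be seen by running the same regex on a plain string:

import re

raw = 'I verified the result.\n<solution>42</solution>\nExiting now.'
found = re.findall(r'<solution>(.*?)</solution>', raw)
assert found and found[0].strip() == '42'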


+FAKE_RESPONSES = {
+    'CodeActAgent': partial(
+        codeact_user_response, encapsulate_solution=True, try_parse=try_parse_answer
+    ),
+}
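
The `partial` keeps the stored callback in the `(state) -> str` shape the driver expects while pinning the benchmark-specific keywords. Assuming `codeact_user_response` accepts these keyword arguments (its definition lives in evaluation/utils/shared.py), the entry is roughly equivalent to:

def _codeact_fake_response(state):
    # encapsulate_solution and try_parse are fixed up front by partial()
    return codeact_user_response(
        state, encapsulate_solution=True, try_parse=try_parse_answer
    )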

+INST_SUFFIXES: dict[str, str] = {
+    'CodeActAgent': (
+        'When you think you have solved the question, '
+        'please first send your answer to user through message and then exit.\n'
+    )
+}


 def analysis_size(size_str):
     size_str = size_str.strip()
     avails = {
@@ -45,17 +75,3 @@ def create_sh_file(filename: str, cmds: str) -> None:
     with open(filename, 'w', encoding='utf-8') as file:
         file.write(cmds.replace('\r\n', '\n'))
     os.chmod(filename, 0o755)
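
A quick usage sketch for `create_sh_file` (the path and script body are illustrative): it normalizes Windows line endings and marks the script executable:

create_sh_file('/tmp/check.sh', 'set -e\r\necho ok\r\n')
# -> /tmp/check.sh now contains Unix newlines and has mode 0755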


-def try_parse_answer(act) -> str | None:
-    raw_ans = ''
-    if isinstance(act, MessageAction) and act.source == 'agent':
-        raw_ans = act.content
-    elif isinstance(act, CmdRunAction) and act.source == 'agent':
-        raw_ans = act.thought
-    else:
-        return None
-    agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
-    if not agent_answer:
-        return None
-    return agent_answer[0].strip()