fix: make max_budget_per_task optional in run_agent_controller (All-Hands-AI#3071)

xingyaoww · web-flow · commit da17665cab3c · 2024-07-22T21:47:00.000-04:00
* fix: make max_budget_per_task optional in `run_agent_controller`

* pass `max_budget_per_task=config.max_budget_per_task` to `run_agent_controller` in each evaluation `run_infer.py`
diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
@@ -122,6 +122,7 @@ def process_instance(
             agent,
             instruction,
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 agent.__class__.__name__
             ],
diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
@@ -116,6 +116,7 @@ def process_instance(
             agent,
             instruction,
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
             sandbox=sandbox,
             sid=inst_id,
diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py
@@ -169,6 +169,7 @@ def process_instance(
             agent,
             instruction,
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 agent.__class__.__name__
             ],
diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
@@ -213,6 +213,7 @@ def execute_sql(db_path, sql):
             agent,
             instruction,
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 agent.__class__.__name__
             ],
diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/browsing_delegation/run_infer.py
@@ -67,7 +67,11 @@ def process_instance(
 
     state: State | None = asyncio.run(
         run_agent_controller(
-            agent, instruction, max_iterations=metadata.max_iterations, sid=env_id
+            agent,
+            instruction,
+            max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
+            sid=env_id,
         )
     )
 
diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
@@ -119,6 +119,7 @@ def process_instance(
                 agent,
                 instruction,
                 max_iterations=metadata.max_iterations,
+                max_budget_per_task=config.max_budget_per_task,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                     agent.__class__.__name__
                 ],
diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
@@ -113,6 +113,7 @@ def process_instance(agent, question_id, question, metadata, reset_logger: bool
                 agent,
                 instruction,
                 max_iterations=metadata.max_iterations,
+                max_budget_per_task=config.max_budget_per_task,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                     agent.__class__.__name__
                 ),
diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
@@ -229,6 +229,7 @@ def process_instance(
                 agent,
                 instruction,
                 max_iterations=metadata.max_iterations,
+                max_budget_per_task=config.max_budget_per_task,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                     agent.__class__.__name__
                 ),
diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py
@@ -182,6 +182,7 @@ def process_instance(
                 agent,
                 instruction,
                 max_iterations=metadata.max_iterations,
+                max_budget_per_task=config.max_budget_per_task,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                     agent.__class__.__name__
                 ),
diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
@@ -180,6 +180,7 @@ def process_instance(
                 agent,
                 instruction,
                 max_iterations=metadata.max_iterations,
+                max_budget_per_task=config.max_budget_per_task,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                     agent.__class__.__name__
                 ),
diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
@@ -81,6 +81,7 @@ def process_instance(
             agent,
             'PLACEHOLDER_GOAL',
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             runtime_tools_config=runtime_tools_config,
             sandbox=get_sandbox(),
             sid=env_id,
diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
@@ -143,6 +143,7 @@ def process_instance(
             agent,
             instruction,
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             fake_user_response_fn=fake_user_response_fn,
             sandbox=sandbox,
             sid=sid,
diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
@@ -150,6 +150,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
                 agent,
                 instruction,
                 max_iterations=metadata.max_iterations,
+                max_budget_per_task=config.max_budget_per_task,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                     agent.__class__.__name__
                 ),
diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
@@ -282,6 +282,7 @@ def process_instance(
             agent,
             instruction,
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 agent.__class__.__name__
             ],
diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
@@ -79,6 +79,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
             agent,
             instruction,
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 agent.__class__.__name__
             ],
diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py
@@ -82,6 +82,7 @@ def process_instance(
             agent,
             'PLACEHOLDER_GOAL',
             max_iterations=metadata.max_iterations,
+            max_budget_per_task=config.max_budget_per_task,
             runtime_tools_config=runtime_tools_config,
             sandbox=get_sandbox(),
             sid=env_id,
diff --git a/opendevin/core/main.py b/opendevin/core/main.py
@@ -34,7 +34,7 @@ async def run_agent_controller(
     agent: Agent,
     task_str: str,
     max_iterations: int,
-    max_budget_per_task: float,
+    max_budget_per_task: float | None = None,
     exit_on_message: bool = False,
     fake_user_response_fn: Callable[[State | None], str] | None = None,
     sandbox: Sandbox | None = None,

Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,11 @@ def process_instance(`
`67`	`67`
`68`	`68`	`state: State \| None = asyncio.run(`
`69`	`69`	`run_agent_controller(`
`70`		`- agent, instruction, max_iterations=metadata.max_iterations, sid=env_id`
	`70`	`+ agent,`
	`71`	`+ instruction,`
	`72`	`+ max_iterations=metadata.max_iterations,`
	`73`	`+ max_budget_per_task=config.max_budget_per_task,`
	`74`	`+ sid=env_id,`
`71`	`75`	`)`
`72`	`76`	`)`
`73`	`77`