
Commit

Make agbenchmark a proxy of the evaluated agent (Significant-Gravitas#5279)

Make agbenchmark a Proxy of the evaluated agent

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
waynehamadi authored Sep 20, 2023
1 parent 1a471b7 commit ff4c76b
Showing 71 changed files with 2,461 additions and 1,299 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/benchmark-ci.yml
@@ -128,7 +128,8 @@ jobs:
echo "Running the following command: ${prefix}agbenchmark --test=WriteFile"
${prefix}agbenchmark --test=WriteFile
sh run_benchmark &
sleep 5
python ../../benchmark/tests/test_web_server.py
cd ../../benchmark
poetry install
poetry run pytest tests
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
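
The workflow no longer pokes a background run_benchmark server with test_web_server.py; instead it installs the benchmark package and runs its pytest suite against the served benchmark app. A minimal sketch of the kind of request such a test could make, assuming the benchmark app is served locally (the host, port, and eval_id below are illustrative, not taken from the commit):

import requests

BENCHMARK_URL = "http://localhost:8080/ap/v1"  # assumed address of the served benchmark app


def test_create_task_via_proxy():
    # Ask the benchmark to create a task on the agent for one known challenge.
    response = requests.post(
        f"{BENCHMARK_URL}/agent/tasks",
        json={"eval_id": "example-eval-id"},  # placeholder eval_id
    )
    assert response.status_code == 200
    assert "task_id" in response.json()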
2 changes: 0 additions & 2 deletions autogpts/forge/forge/sdk/__init__.py
@@ -12,12 +12,10 @@
Pagination,
Status,
Step,
StepInput,
StepOutput,
StepRequestBody,
Task,
TaskArtifactsListResponse,
TaskInput,
TaskListResponse,
TaskRequestBody,
TaskStepsListResponse,
6 changes: 3 additions & 3 deletions autogpts/forge/forge/sdk/db.py
@@ -23,7 +23,7 @@

from .errors import NotFoundError
from .forge_log import ForgeLogger
from .schema import Artifact, Pagination, Status, Step, StepRequestBody, Task, TaskInput
from .schema import Artifact, Pagination, Status, Step, StepRequestBody, Task

LOG = ForgeLogger(__name__)

@@ -140,7 +140,7 @@ def __init__(self, database_string, debug_enabled: bool = False) -> None:
self.Session = sessionmaker(bind=self.engine)

async def create_task(
self, input: Optional[str], additional_input: Optional[TaskInput] = {}
self, input: Optional[str], additional_input: Optional[dict] = {}
) -> Task:
if self.debug_enabled:
LOG.debug("Creating new task")
@@ -150,7 +150,7 @@ async def create_task(
new_task = TaskModel(
task_id=str(uuid.uuid4()),
input=input,
additional_input=additional_input.json()
additional_input=additional_input
if additional_input
else {},
)
12 changes: 2 additions & 10 deletions autogpts/forge/forge/sdk/schema.py
@@ -27,10 +27,6 @@ class Pagination(BaseModel):
page_size: int = Field(..., description="Number of items per page.", example=25)


class TaskInput(BaseModel):
pass


class Artifact(BaseModel):
created_at: datetime = Field(
...,
@@ -66,10 +62,6 @@ class Artifact(BaseModel):
)


class StepInput(BaseModel):
pass


class StepOutput(BaseModel):
pass

@@ -81,7 +73,7 @@ class TaskRequestBody(BaseModel):
description="Input prompt for the task.",
example="Write the words you receive to the file 'output.txt'.",
)
additional_input: Optional[TaskInput] = {}
additional_input: Optional[dict] = {}


class Task(TaskRequestBody):
@@ -122,7 +114,7 @@ class StepRequestBody(BaseModel):
description="Input prompt for the step.",
example="Washington",
)
additional_input: Optional[StepInput] = {}
additional_input: Optional[dict] = {}


class Status(Enum):
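
With TaskInput and StepInput removed, additional_input is typed as a plain optional dict, so any JSON object passes validation. A minimal sketch of the relaxed schema (field options abridged from the real schema.py):

from typing import Optional

from pydantic import BaseModel, Field


class TaskRequestBody(BaseModel):
    input: Optional[str] = Field(
        None, description="Input prompt for the task."
    )
    additional_input: Optional[dict] = {}


body = TaskRequestBody(
    input="Write the words you receive to the file 'output.txt'.",
    additional_input={"mode": "python/code"},  # any JSON object is accepted now
)
print(body.json())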
268 changes: 168 additions & 100 deletions autogpts/forge/poetry.lock

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions benchmark/agbenchmark/__main__.py
@@ -272,9 +272,6 @@ def version():
print(f"Benchmark Tool Version {version}")


from pathlib import Path


def serve():
import uvicorn

30 changes: 17 additions & 13 deletions benchmark/agbenchmark/agent_api_interface.py
@@ -60,19 +60,23 @@ async def run_api_agent(
api_instance, artifacts_location, task_id, "artifacts_out"
)

artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
for artifact in artifacts.artifacts:
# current absolute path of the directory of the file
directory_location = TEMP_FOLDER_ABS_PATH
if artifact.relative_path:
directory_location = directory_location / artifact.relative_path

with open(directory_location / artifact.file_name, "wb") as f:
content = await api_instance.download_agent_task_artifact(
task_id=task_id, artifact_id=artifact.artifact_id
)

f.write(content)
await copy_agent_artifacts_into_temp_folder(api_instance, task_id)


async def copy_agent_artifacts_into_temp_folder(api_instance, task_id):
artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
for artifact in artifacts.artifacts:
# current absolute path of the directory of the file
directory_location = TEMP_FOLDER_ABS_PATH
if artifact.relative_path:
directory_location = directory_location / artifact.relative_path

with open(directory_location / artifact.file_name, "wb") as f:
content = await api_instance.download_agent_task_artifact(
task_id=task_id, artifact_id=artifact.artifact_id
)

f.write(content)


async def append_updates_file(step: Step):
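
The artifact-download loop is factored out of run_api_agent into copy_agent_artifacts_into_temp_folder so the new /agent/tasks/{task_id}/evaluations route in app.py can reuse it. A sketch of that reuse (the configuration host mirrors the one hard-coded in app.py; task_id is a placeholder):

from agbenchmark.agent_api_interface import copy_agent_artifacts_into_temp_folder
from agbenchmark.agent_protocol_client import AgentApi, ApiClient, Configuration

configuration = Configuration(host="http://localhost:8000/ap/v1")


async def fetch_artifacts(task_id: str) -> None:
    async with ApiClient(configuration) as api_client:
        api_instance = AgentApi(api_client)
        # Pull every artifact the agent produced for this task into the
        # benchmark's temp workspace so the challenge can be scored.
        await copy_agent_artifacts_into_temp_folder(api_instance, task_id)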
209 changes: 206 additions & 3 deletions benchmark/agbenchmark/app.py
@@ -1,22 +1,43 @@
import datetime
from collections import defaultdict, deque
from pathlib import Path

import httpx

from agbenchmark.agent_protocol_client import (
AgentApi,
ApiClient,
ApiException,
Configuration,
)
from agbenchmark.reports.processing.report_types_v2 import BenchmarkRun
from agbenchmark.schema import TaskEvalRequestBody

configuration = Configuration(host="http://localhost:8000" + "/ap/v1")

import json
import os
import sys
from typing import Any, Optional

import psutil
from fastapi import FastAPI
from fastapi import APIRouter, FastAPI
from fastapi import (
HTTPException as FastAPIHTTPException, # Import HTTPException from FastAPI
)
from fastapi import Request, Response
from fastapi.middleware.cors import CORSMiddleware

from agbenchmark.execute_sub_process import execute_subprocess
from agbenchmark.schema import Task, TaskRequestBody

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from fastapi import FastAPI
from pydantic import BaseModel, Extra

router = APIRouter()
import glob

# Change the current working directory to the benchmark path
# home_path = find_absolute_benchmark_path()
# os.chdir(home_path)
@@ -25,6 +46,27 @@

import psutil

challenges_path = os.path.join(os.path.dirname(__file__), "challenges")

json_files = deque(
glob.glob(
f"{challenges_path}/**/data.json",
recursive=True,
)
)

CHALLENGES = {}
task_informations = defaultdict(dict)

while json_files:
json_file = json_files.popleft()

with open(json_file, "r") as file:
data = json.load(file)
# ok
CHALLENGES[data["eval_id"]] = data
CHALLENGES[data["eval_id"]]["path"] = json_file
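# Illustrative shape of one loaded record (an assumption inferred from the
# fields read later in this file, not part of the commit itself):
#
#   CHALLENGES["<eval_id>"] == {
#       "eval_id": "<eval_id>",
#       "name": "WriteFile",
#       "task": "Write the word 'Washington' to a .txt file",
#       "category": ["general"],
#       "ground": {"answer": "Washington"},
#       "info": {"description": "..."},
#       "path": "<path to the challenge's data.json>",
#   }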


def find_agbenchmark_without_uvicorn():
pids = []
@@ -89,7 +131,7 @@ def stream_output(pipe):
print(line, end="")


@app.post("/reports")
@router.post("/reports")
def run_single_test(body: CreateReportRequest) -> Any:
pids = find_agbenchmark_without_uvicorn()
print(f"pids already running with agbenchmark: {pids}")
@@ -144,7 +186,7 @@ def run_single_test(body: CreateReportRequest) -> Any:
from fastapi import FastAPI, Request, Response


@app.get("/updates")
@router.get("/updates")
def get_updates(request: Request) -> Any:
from agbenchmark.__main__ import UPDATES_JSON_PATH

@@ -193,3 +235,164 @@ def get_updates(request: Request) -> Any:
media_type="application/json",
headers={"Content-Type": "application/json"},
)


@router.post("/agent/tasks", tags=["agent"], response_model=Task)
async def create_agent_task(task_eval_request: TaskEvalRequestBody) -> Task:
"""
Creates a new task using the provided TaskRequestBody and returns a Task.
Args:
request (Request): FastAPI request object.
task (TaskRequestBody): The task request containing input and additional input data.
Returns:
Task: A new task with task_id, input, additional_input, and empty lists for artifacts and steps.
Example:
Request (TaskRequestBody defined in schema.py):
{
"input": "Write the words you receive to the file 'output.txt'.",
"additional_input": "python/code"
}
Response (Task defined in schema.py):
{
"task_id": "50da533e-3904-4401-8a07-c49adf88b5eb",
"input": "Write the word 'Washington' to a .txt file",
"additional_input": "python/code",
"artifacts": [],
}
"""
from agbenchmark.agent_api_interface import upload_artifacts

try:
async with ApiClient(configuration) as api_client:
api_instance = AgentApi(api_client)
task_input = CHALLENGES[task_eval_request.eval_id]["task"]

task_request_body = TaskRequestBody(input=task_input)
task_response = await api_instance.create_agent_task(
task_request_body=task_request_body
)
task_informations[task_response.task_id][
"benchmark_start_time"
] = datetime.datetime.now(datetime.timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S+00:00"
)
task_informations[task_response.task_id][
"eval_id"
] = task_eval_request.eval_id
await api_instance.create_agent_task(task_request_body=task_request_body)
await upload_artifacts(
api_instance,
str(Path(CHALLENGES[task_eval_request.eval_id]["path"]).parent),
task_response.task_id,
"artifacts_in",
)
return Response(
content=task_response.json(),
status_code=200,
media_type="application/json",
)
except ApiException as e:
print(f"Error whilst trying to create a task: {task_eval_request}")
return Response(
content=json.dumps({"error": "Internal server error"}),
status_code=500,
media_type="application/json",
)


@router.post("/agent/tasks/{task_id}/steps")
async def proxy(request: Request, task_id: str):
async with httpx.AsyncClient() as client:
# Construct the new URL
new_url = f"http://localhost:8000/ap/v1/agent/tasks/{task_id}/steps"

# Forward the request
response = await client.post(
new_url,
data=await request.body(),
headers=dict(request.headers),
)

# Return the response from the forwarded request
return Response(content=response.content, status_code=response.status_code)


@router.post("/agent/tasks/{task_id}/evaluations")
async def create_evaluation(task_id: str) -> deque:
from agbenchmark.agent_api_interface import copy_agent_artifacts_into_temp_folder
from agbenchmark.generate_test import create_challenge

try:
async with ApiClient(configuration) as api_client:
api_instance = AgentApi(api_client)
await copy_agent_artifacts_into_temp_folder(api_instance, task_id)

data = CHALLENGES[task_informations[task_id]["eval_id"]]
json_file = CHALLENGES[task_informations[task_id]["eval_id"]]["path"]
json_files = deque()

_, challenge_class = create_challenge(data, json_file, json_files)
challenge_instance = challenge_class()
scores = challenge_instance.get_scores(config={})
test_name = "Test" + data["name"]
is_score_100 = 1 in scores["values"]

info_details = {
"repository_info": {
"repo_url": None,
"team_name": None,
"benchmark_git_commit_sha": None,
"agent_git_commit_sha": None,
},
"run_details": {
"run_id": None,
"command": "agbenchmark" + " --test=" + test_name,
"completion_time": None,
"benchmark_start_time": task_informations[task_id][
"benchmark_start_time"
],
"test_name": data["name"],
},
"task_info": {
"data_path": data["path"].split("benchmark/", 1)[-1],
"is_regression": None,
"category": data["category"],
"task": data["task"],
"answer": data["ground"]["answer"],
"description": data["info"]["description"],
},
"metrics": {
"difficulty": None,
"success": is_score_100,
"attempted": True,
"success_percentage": None,
"cost": None,
"run_time": None,
},
"reached_cutoff": None,
"config": {},
}

BenchmarkRun.parse_obj(info_details)

print(json.dumps(info_details, indent=4))
return Response(
content=json.dumps(info_details),
status_code=200,
media_type="application/json",
)
except ApiException as e:
print(f"Error whilst trying to evaluate the task: {task_id}")
return Response(
content=json.dumps({"error": "Internal server error"}),
status_code=500,
media_type="application/json",
)
# path = Path(json_file).resolve()


app.include_router(router, prefix="/ap/v1")
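
With these routes mounted under /ap/v1, agbenchmark itself exposes an Agent Protocol-style surface: a client creates a task by eval_id, step calls are forwarded to the agent on localhost:8000, and a final evaluations call downloads the agent's artifacts and scores the challenge. A hypothetical end-to-end client flow (the benchmark's own host/port, the eval_id, and the step body are illustrative assumptions):

import requests

BENCHMARK = "http://localhost:8080/ap/v1"  # assumed address of the served benchmark app

# 1. Create a task on the agent via the benchmark, keyed by the challenge's eval_id.
task = requests.post(
    f"{BENCHMARK}/agent/tasks",
    json={"eval_id": "example-eval-id"},  # placeholder
).json()
task_id = task["task_id"]

# 2. Drive the agent: each step request is proxied verbatim to
#    http://localhost:8000/ap/v1/agent/tasks/{task_id}/steps.
step = requests.post(
    f"{BENCHMARK}/agent/tasks/{task_id}/steps",
    json={"input": ""},  # illustrative step body
).json()

# 3. Ask the benchmark to copy the agent's artifacts and evaluate the challenge.
report = requests.post(f"{BENCHMARK}/agent/tasks/{task_id}/evaluations").json()
print(report["metrics"]["success"])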
