
Commit

Make agbenchmark a proxy of the evaluated agent (Significant-Gravitas#5279)

Make agbenchmark a Proxy of the evaluated agent

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
waynehamadi authored Sep 20, 2023
1 parent 1a471b7 commit ff4c76b
Showing 71 changed files with 2,461 additions and 1,299 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/benchmark-ci.yml
@@ -128,7 +128,8 @@ jobs:
echo "Running the following command: ${prefix}agbenchmark --test=WriteFile"
${prefix}agbenchmark --test=WriteFile
sh run_benchmark &
sleep 5
python ../../benchmark/tests/test_web_server.py
cd ../../benchmark
poetry install
poetry run pytest tests
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
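
The workflow no longer pokes a background run_benchmark server with test_web_server.py; instead it installs the benchmark package and runs its pytest suite against the served benchmark app. A minimal sketch of the kind of request such a test could make, assuming the benchmark app is served locally (the host, port, and eval_id below are illustrative, not taken from the commit):

import requests

BENCHMARK_URL = "http://localhost:8080/ap/v1"  # assumed address of the served benchmark app


def test_create_task_via_proxy():
    # Ask the benchmark to create a task on the agent for one known challenge.
    response = requests.post(
        f"{BENCHMARK_URL}/agent/tasks",
        json={"eval_id": "example-eval-id"},  # placeholder eval_id
    )
    assert response.status_code == 200
    assert "task_id" in response.json()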
2 changes: 0 additions & 2 deletions autogpts/forge/forge/sdk/__init__.py
@@ -12,12 +12,10 @@
Pagination,
Status,
Step,
StepInput,
StepOutput,
StepRequestBody,
Task,
TaskArtifactsListResponse,
TaskInput,
TaskListResponse,
TaskRequestBody,
TaskStepsListResponse,
6 changes: 3 additions & 3 deletions autogpts/forge/forge/sdk/db.py
@@ -23,7 +23,7 @@

from .errors import NotFoundError
from .forge_log import ForgeLogger
from .schema import Artifact, Pagination, Status, Step, StepRequestBody, Task, TaskInput
from .schema import Artifact, Pagination, Status, Step, StepRequestBody, Task

LOG = ForgeLogger(__name__)

@@ -140,7 +140,7 @@ def __init__(self, database_string, debug_enabled: bool = False) -> None:
self.Session = sessionmaker(bind=self.engine)

async def create_task(
self, input: Optional[str], additional_input: Optional[TaskInput] = {}
self, input: Optional[str], additional_input: Optional[dict] = {}
) -> Task:
if self.debug_enabled:
LOG.debug("Creating new task")
@@ -150,7 +150,7 @@ async def create_task(
new_task = TaskModel(
task_id=str(uuid.uuid4()),
input=input,
additional_input=additional_input.json()
additional_input=additional_input
if additional_input
else {},
)
12 changes: 2 additions & 10 deletions autogpts/forge/forge/sdk/schema.py
@@ -27,10 +27,6 @@ class Pagination(BaseModel):
page_size: int = Field(..., description="Number of items per page.", example=25)


class TaskInput(BaseModel):
pass


class Artifact(BaseModel):
created_at: datetime = Field(
...,
@@ -66,10 +62,6 @@ class Artifact(BaseModel):
)


class StepInput(BaseModel):
pass


class StepOutput(BaseModel):
pass

@@ -81,7 +73,7 @@ class TaskRequestBody(BaseModel):
description="Input prompt for the task.",
example="Write the words you receive to the file 'output.txt'.",
)
additional_input: Optional[TaskInput] = {}
additional_input: Optional[dict] = {}


class Task(TaskRequestBody):
@@ -122,7 +114,7 @@ class StepRequestBody(BaseModel):
description="Input prompt for the step.",
example="Washington",
)
additional_input: Optional[StepInput] = {}
additional_input: Optional[dict] = {}


class Status(Enum):
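
With TaskInput and StepInput removed, additional_input is typed as a plain optional dict, so any JSON object passes validation. A minimal sketch of the relaxed schema (field options abridged from the real schema.py):

from typing import Optional

from pydantic import BaseModel, Field


class TaskRequestBody(BaseModel):
    input: Optional[str] = Field(
        None, description="Input prompt for the task."
    )
    additional_input: Optional[dict] = {}


body = TaskRequestBody(
    input="Write the words you receive to the file 'output.txt'.",
    additional_input={"mode": "python/code"},  # any JSON object is accepted now
)
print(body.json())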
268 changes: 168 additions & 100 deletions autogpts/forge/poetry.lock

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions benchmark/agbenchmark/__main__.py
@@ -272,9 +272,6 @@ def version():
print(f"Benchmark Tool Version {version}")


from pathlib import Path


def serve():
import uvicorn

30 changes: 17 additions & 13 deletions benchmark/agbenchmark/agent_api_interface.py
@@ -60,19 +60,23 @@ async def run_api_agent(
api_instance, artifacts_location, task_id, "artifacts_out"
)

artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
for artifact in artifacts.artifacts:
# current absolute path of the directory of the file
directory_location = TEMP_FOLDER_ABS_PATH
if artifact.relative_path:
directory_location = directory_location / artifact.relative_path

with open(directory_location / artifact.file_name, "wb") as f:
content = await api_instance.download_agent_task_artifact(
task_id=task_id, artifact_id=artifact.artifact_id
)

f.write(content)
await copy_agent_artifacts_into_temp_folder(api_instance, task_id)


async def copy_agent_artifacts_into_temp_folder(api_instance, task_id):
artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
for artifact in artifacts.artifacts:
# current absolute path of the directory of the file
directory_location = TEMP_FOLDER_ABS_PATH
if artifact.relative_path:
directory_location = directory_location / artifact.relative_path

with open(directory_location / artifact.file_name, "wb") as f:
content = await api_instance.download_agent_task_artifact(
task_id=task_id, artifact_id=artifact.artifact_id
)

f.write(content)


async def append_updates_file(step: Step):
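
The artifact-download loop is factored out of run_api_agent into copy_agent_artifacts_into_temp_folder so the new /agent/tasks/{task_id}/evaluations route in app.py can reuse it. A sketch of that reuse (the configuration host mirrors the one hard-coded in app.py; task_id is a placeholder):

from agbenchmark.agent_api_interface import copy_agent_artifacts_into_temp_folder
from agbenchmark.agent_protocol_client import AgentApi, ApiClient, Configuration

configuration = Configuration(host="http://localhost:8000/ap/v1")


async def fetch_artifacts(task_id: str) -> None:
    async with ApiClient(configuration) as api_client:
        api_instance = AgentApi(api_client)
        # Pull every artifact the agent produced for this task into the
        # benchmark's temp workspace so the challenge can be scored.
        await copy_agent_artifacts_into_temp_folder(api_instance, task_id)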
209 changes: 206 additions & 3 deletions benchmark/agbenchmark/app.py
@@ -1,22 +1,43 @@
import datetime
from collections import defaultdict, deque
from pathlib import Path

import httpx

from agbenchmark.agent_protocol_client import (
AgentApi,
ApiClient,
ApiException,
Configuration,
)
from agbenchmark.reports.processing.report_types_v2 import BenchmarkRun
from agbenchmark.schema import TaskEvalRequestBody

configuration = Configuration(host="http://localhost:8000" + "/ap/v1")

import json
import os
import sys
from typing import Any, Optional

import psutil
from fastapi import FastAPI
from fastapi import APIRouter, FastAPI
from fastapi import (
HTTPException as FastAPIHTTPException, # Import HTTPException from FastAPI
)
from fastapi import Request, Response
from fastapi.middleware.cors import CORSMiddleware

from agbenchmark.execute_sub_process import execute_subprocess
from agbenchmark.schema import Task, TaskRequestBody

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from fastapi import FastAPI
from pydantic import BaseModel, Extra

router = APIRouter()
import glob

# Change the current working directory to the benchmark path
# home_path = find_absolute_benchmark_path()
# os.chdir(home_path)
@@ -25,6 +46,27 @@

import psutil

challenges_path = os.path.join(os.path.dirname(__file__), "challenges")

json_files = deque(
glob.glob(
f"{challenges_path}/**/data.json",
recursive=True,
)
)

CHALLENGES = {}
task_informations = defaultdict(dict)

while json_files:
json_file = json_files.popleft()

with open(json_file, "r") as file:
data = json.load(file)
# ok
CHALLENGES[data["eval_id"]] = data
CHALLENGES[data["eval_id"]]["path"] = json_file
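# Illustrative shape of one loaded record (an assumption inferred from the
# fields read later in this file, not part of the commit itself):
#
#   CHALLENGES["<eval_id>"] == {
#       "eval_id": "<eval_id>",
#       "name": "WriteFile",
#       "task": "Write the word 'Washington' to a .txt file",
#       "category": ["general"],
#       "ground": {"answer": "Washington"},
#       "info": {"description": "..."},
#       "path": "<path to the challenge's data.json>",
#   }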


def find_agbenchmark_without_uvicorn():
pids = []
@@ -89,7 +131,7 @@ def stream_output(pipe):
print(line, end="")


@app.post("/reports")
@router.post("/reports")
def run_single_test(body: CreateReportRequest) -> Any:
pids = find_agbenchmark_without_uvicorn()
print(f"pids already running with agbenchmark: {pids}")
@@ -144,7 +186,7 @@ def run_single_test(body: CreateReportRequest) -> Any:
from fastapi import FastAPI, Request, Response


@app.get("/updates")
@router.get("/updates")
def get_updates(request: Request) -> Any:
from agbenchmark.__main__ import UPDATES_JSON_PATH

@@ -193,3 +235,164 @@ def get_updates(request: Request) -> Any:
media_type="application/json",
headers={"Content-Type": "application/json"},
)


@router.post("/agent/tasks", tags=["agent"], response_model=Task)
async def create_agent_task(task_eval_request: TaskEvalRequestBody) -> Task:
"""
Creates a new task using the provided TaskRequestBody and returns a Task.
Args:
request (Request): FastAPI request object.
task (TaskRequestBody): The task request containing input and additional input data.
Returns:
Task: A new task with task_id, input, additional_input, and empty lists for artifacts and steps.
Example:
Request (TaskRequestBody defined in schema.py):
{
"input": "Write the words you receive to the file 'output.txt'.",
"additional_input": "python/code"
}
Response (Task defined in schema.py):
{
"task_id": "50da533e-3904-4401-8a07-c49adf88b5eb",
"input": "Write the word 'Washington' to a .txt file",
"additional_input": "python/code",
"artifacts": [],
}
"""
from agbenchmark.agent_api_interface import upload_artifacts

try:
async with ApiClient(configuration) as api_client:
api_instance = AgentApi(api_client)
task_input = CHALLENGES[task_eval_request.eval_id]["task"]

task_request_body = TaskRequestBody(input=task_input)
task_response = await api_instance.create_agent_task(
task_request_body=task_request_body
)
task_informations[task_response.task_id][
"benchmark_start_time"
] = datetime.datetime.now(datetime.timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S+00:00"
)
task_informations[task_response.task_id][
"eval_id"
] = task_eval_request.eval_id
await api_instance.create_agent_task(task_request_body=task_request_body)
await upload_artifacts(
api_instance,
str(Path(CHALLENGES[task_eval_request.eval_id]["path"]).parent),
task_response.task_id,
"artifacts_in",
)
return Response(
content=task_response.json(),
status_code=200,
media_type="application/json",
)
except ApiException as e:
print(f"Error whilst trying to create a task: {task_eval_request}")
return Response(
content=json.dumps({"error": "Internal server error"}),
status_code=500,
media_type="application/json",
)


@router.post("/agent/tasks/{task_id}/steps")
async def proxy(request: Request, task_id: str):
async with httpx.AsyncClient() as client:
# Construct the new URL
new_url = f"http://localhost:8000/ap/v1/agent/tasks/{task_id}/steps"

# Forward the request
response = await client.post(
new_url,
data=await request.body(),
headers=dict(request.headers),
)

# Return the response from the forwarded request
return Response(content=response.content, status_code=response.status_code)


@router.post("/agent/tasks/{task_id}/evaluations")
async def create_evaluation(task_id: str) -> deque:
from agbenchmark.agent_api_interface import copy_agent_artifacts_into_temp_folder
from agbenchmark.generate_test import create_challenge

try:
async with ApiClient(configuration) as api_client:
api_instance = AgentApi(api_client)
await copy_agent_artifacts_into_temp_folder(api_instance, task_id)

data = CHALLENGES[task_informations[task_id]["eval_id"]]
json_file = CHALLENGES[task_informations[task_id]["eval_id"]]["path"]
json_files = deque()

_, challenge_class = create_challenge(data, json_file, json_files)
challenge_instance = challenge_class()
scores = challenge_instance.get_scores(config={})
test_name = "Test" + data["name"]
is_score_100 = 1 in scores["values"]

info_details = {
"repository_info": {
"repo_url": None,
"team_name": None,
"benchmark_git_commit_sha": None,
"agent_git_commit_sha": None,
},
"run_details": {
"run_id": None,
"command": "agbenchmark" + " --test=" + test_name,
"completion_time": None,
"benchmark_start_time": task_informations[task_id][
"benchmark_start_time"
],
"test_name": data["name"],
},
"task_info": {
"data_path": data["path"].split("benchmark/", 1)[-1],
"is_regression": None,
"category": data["category"],
"task": data["task"],
"answer": data["ground"]["answer"],
"description": data["info"]["description"],
},
"metrics": {
"difficulty": None,
"success": is_score_100,
"attempted": True,
"success_percentage": None,
"cost": None,
"run_time": None,
},
"reached_cutoff": None,
"config": {},
}

BenchmarkRun.parse_obj(info_details)

print(json.dumps(info_details, indent=4))
return Response(
content=json.dumps(info_details),
status_code=200,
media_type="application/json",
)
except ApiException as e:
print(f"Error whilst trying to evaluate the task: {task_id}")
return Response(
content=json.dumps({"error": "Internal server error"}),
status_code=500,
media_type="application/json",
)
# path = Path(json_file).resolve()


app.include_router(router, prefix="/ap/v1")
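
With these routes mounted under /ap/v1, agbenchmark itself exposes an Agent Protocol-style surface: a client creates a task by eval_id, step calls are forwarded to the agent on localhost:8000, and a final evaluations call downloads the agent's artifacts and scores the challenge. A hypothetical end-to-end client flow (the benchmark's own host/port, the eval_id, and the step body are illustrative assumptions):

import requests

BENCHMARK = "http://localhost:8080/ap/v1"  # assumed address of the served benchmark app

# 1. Create a task on the agent via the benchmark, keyed by the challenge's eval_id.
task = requests.post(
    f"{BENCHMARK}/agent/tasks",
    json={"eval_id": "example-eval-id"},  # placeholder
).json()
task_id = task["task_id"]

# 2. Drive the agent: each step request is proxied verbatim to
#    http://localhost:8000/ap/v1/agent/tasks/{task_id}/steps.
step = requests.post(
    f"{BENCHMARK}/agent/tasks/{task_id}/steps",
    json={"input": ""},  # illustrative step body
).json()

# 3. Ask the benchmark to copy the agent's artifacts and evaluate the challenge.
report = requests.post(f"{BENCHMARK}/agent/tasks/{task_id}/evaluations").json()
print(report["metrics"]["success"])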
