From 1a61c66898cfa903504963528949d562ae9388b9 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Fri, 11 Aug 2023 13:22:21 +0100
Subject: [PATCH] mock flag, workspace io fixes, mark fixes

---
 .env.example                              |  1 -
 agbenchmark/agent_interface.py            |  7 ++++++-
 agbenchmark/challenges                    |  2 +-
 agbenchmark/conftest.py                   |  1 -
 .../reports/processing/process_report.py  |  6 +++++-
 .../reports/processing/report_types.py    |  2 +-
 agbenchmark/reports/reports.py            | 19 ++++++++++---------
 agbenchmark/start_benchmark.py            |  4 ----
 agbenchmark/utils/challenge.py            | 15 ++++++++++-----
 agent/SuperAGI                            |  2 +-
 pyproject.toml                            |  3 ++-
 11 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/.env.example b/.env.example
index c4b6d26ada4..6d1504095fc 100644
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,3 @@
 AGENT_NAME=mini-agi
 REPORT_LOCATION="reports/mini-agi"
-MOCK_TEST=False # this is automatically set with the --mock flag
 OPENAI_API_KEY="sk-" # for LLM eval
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 016c6373af2..2c9682cb2d7 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -80,8 +80,13 @@ def get_list_of_file_paths(


 def copy_artifacts_into_workspace(
-    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+    workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
+    if isinstance(workspace, dict):
+        if artifact_folder_name == "artifacts_in":
+            workspace = workspace["input"]
+        else:
+            workspace = workspace["output"]
     file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
     for file_path in file_paths:
         if os.path.isfile(file_path):
diff --git a/agbenchmark/challenges b/agbenchmark/challenges
index b05bd27b8b0..0ec140a61ff 160000
--- a/agbenchmark/challenges
+++ b/agbenchmark/challenges
@@ -1 +1 @@
-Subproject commit b05bd27b8b056843e03c3e9d6056470eaba6e7dd
+Subproject commit 0ec140a61ff6740bb62059e5d1d61495f845f7d2
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index a535eed94a8..4d06e1bd01f 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -76,7 +76,6 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
     yield config["workspace"]
     # teardown after test function completes
     if not config.get("keep_workspace_files", False):
-        print("Emptying workspace")
         for filename in os.listdir(output_path):
             file_path = os.path.join(output_path, filename)
             try:
diff --git a/agbenchmark/reports/processing/process_report.py b/agbenchmark/reports/processing/process_report.py
index 72b5a5c43d3..25f9303a2fb 100644
--- a/agbenchmark/reports/processing/process_report.py
+++ b/agbenchmark/reports/processing/process_report.py
@@ -36,7 +36,11 @@ def get_agent_category(report: Report) -> dict[str, Any]:

     def get_highest_category_difficulty(data: Test) -> None:
         for category in data.category:
-            if category == "interface" or category == "iterate":
+            if (
+                category == "interface"
+                or category == "iterate"
+                or category == "product_advisor"
+            ):
                 continue
             categories.setdefault(category, 0)
             if data.metrics.success:
diff --git a/agbenchmark/reports/processing/report_types.py b/agbenchmark/reports/processing/report_types.py
index a1184dac8ed..30d4346ce0f 100644
--- a/agbenchmark/reports/processing/report_types.py
+++ b/agbenchmark/reports/processing/report_types.py
@@ -43,4 +43,4 @@ class Report(BaseModel):
     benchmark_start_time: str
     metrics: MetricsOverall
     tests: Dict[str, Union[Test, SuiteTest]]
-    config: Dict[str, str]
+    config: Dict[str, str | dict[str, str]]
diff --git a/agbenchmark/reports/reports.py b/agbenchmark/reports/reports.py
index d482532f667..ddf893c751c 100644
--- a/agbenchmark/reports/reports.py
+++ b/agbenchmark/reports/reports.py
@@ -7,7 +7,6 @@
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
-    MOCK_FLAG,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
     SUCCESS_RATE_PATH,
@@ -144,7 +143,6 @@ def update_regression_tests(
 def generate_single_call_report(
     item: Any, call: Any, challenge_data: dict[str, Any]
 ) -> None:
-
     try:
         difficulty = challenge_data["info"]["difficulty"]
     except KeyError:
@@ -205,11 +203,9 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
     if info_details and test_name:
         if run_time:
             cost = None
-            if not MOCK_FLAG and os.environ.get("HELICONE_API_KEY"):
+            if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
                 print("Getting cost from Helicone")
                 cost = get_data_from_helicone(test_name)
-            else:
-                print("Helicone not setup or mock flag set, not getting cost")

             info_details["metrics"]["cost"] = cost

@@ -226,10 +222,15 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:

             info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

-        update_challenges_already_beaten(info_details, test_name)
-        if info_details.get("tests") is not None:
-            for nested_test_name, nested_test_info in info_details["tests"].items():
-                update_challenges_already_beaten(nested_test_info, nested_test_name)
+        if "--mock" not in sys.argv:
+            update_challenges_already_beaten(info_details, test_name)
+            if info_details.get("tests") is not None:
+                for nested_test_name, nested_test_info in info_details[
+                    "tests"
+                ].items():
+                    update_challenges_already_beaten(
+                        nested_test_info, nested_test_name
+                    )

     info_manager.add_test(test_name, info_details)
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index e46e369e393..1c5ea42fafb 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -22,8 +22,6 @@
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
     )

-MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
-
 (
     HOME_DIRECTORY,
@@ -170,8 +168,6 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")

-    os.environ["MOCK_TEST"] = "True" if mock else "False"
-
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py
index d130f48613d..eb9c7019436 100644
--- a/agbenchmark/utils/challenge.py
+++ b/agbenchmark/utils/challenge.py
@@ -10,7 +10,7 @@
 import pytest

 from agbenchmark.agent_api_interface import run_api_agent
-from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
+from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
 from agbenchmark.utils.data_types import ChallengeData, Ground
 from agbenchmark.utils.prompts import (
     END_PROMPT,
@@ -61,7 +61,7 @@ async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
         )
         print(f"\033[1;30mTask: {self.task}\033[0m")

-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             print("Running mock agent")
             copy_artifacts_into_workspace(
                 config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION
@@ -88,7 +88,12 @@ def open_file(workspace: str, filename: str) -> str:
         with open(workspace_dir, "r") as f:
             return f.read()

-    def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
+    def get_artifacts_out(
+        self, workspace: str | dict[str, str], ground: Ground
+    ) -> List[str]:
+        if isinstance(workspace, dict):
+            workspace = workspace["output"]
+
         script_dir = workspace
         files_contents = []

@@ -163,7 +168,7 @@ def scoring(self, config: Dict[str, Any], content: str, ground: Ground) -> float

     def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
         openai.api_key = os.getenv("OPENAI_API_KEY")
-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             return 1.0

         # the validation for this is done in the Eval BaseModel
@@ -190,7 +195,7 @@ def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
         percentage = None

         try:
-            if self.data.task == "" and MOCK_FLAG:
+            if self.data.task == "" and "--mock" in sys.argv:
                 scores = [1.0]
             elif isinstance(self.data.ground, Ground):
                 files_contents = self.get_artifacts_out(
diff --git a/agent/SuperAGI b/agent/SuperAGI
index b0318053b6c..48b21013742 160000
--- a/agent/SuperAGI
+++ b/agent/SuperAGI
@@ -1 +1 @@
-Subproject commit b0318053b6cbe357f2e020fe0f1275a2cb3da767
+Subproject commit 48b2101374264b97dbdfc2c0bb0ae45e769e157d
diff --git a/pyproject.toml b/pyproject.toml
index 1d39c7120a9..25d62cffcca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,7 +60,8 @@ markers = [
     "iterate",
     "adaptability",
     "safety",
-    "content_gen"
+    "content_gen",
+    "product_advisor"
 ]
 filterwarnings = [
     "ignore::pytest.PytestAssertRewriteWarning",
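
Illustration (not part of the patch): with this change a challenge workspace may be configured either as a single path or as an input/output pair, and copy_artifacts_into_workspace resolves the right side before copying. The sketch below mirrors the routing logic added in agent_interface.py; the copy loop is an assumed stand-in for the module's get_list_of_file_paths helper, and all paths are hypothetical.

import os
import shutil


def copy_artifacts_into_workspace(
    workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
) -> None:
    # Routing from the patch: a dict workspace sends "artifacts_in" to the
    # agent's input directory and everything else (e.g. the "artifacts_out"
    # folder that --mock runs copy verbatim) to its output directory.
    if isinstance(workspace, dict):
        if artifact_folder_name == "artifacts_in":
            workspace = workspace["input"]
        else:
            workspace = workspace["output"]
    # Assumed copy step, standing in for get_list_of_file_paths(): copy every
    # file in the challenge's artifact folder into the resolved workspace.
    source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
    if not os.path.isdir(source_dir):
        return
    os.makedirs(workspace, exist_ok=True)
    for name in os.listdir(source_dir):
        file_path = os.path.join(source_dir, name)
        if os.path.isfile(file_path):
            shutil.copy(file_path, workspace)


# Both config shapes are accepted (hypothetical paths):
copy_artifacts_into_workspace("workspace", "artifacts_in", "challenge_dir")
copy_artifacts_into_workspace(
    {"input": "workspace/in", "output": "workspace/out"},
    "artifacts_out",
    "challenge_dir",
)

Either shape satisfies the widened config type Dict[str, str | dict[str, str]] in report_types.py, and get_artifacts_out likewise reads from the "output" side when handed a dict.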