From 1a61c66898cfa903504963528949d562ae9388b9 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Fri, 11 Aug 2023 13:22:21 +0100
Subject: [PATCH] mock flag, workspace io fixes, mark fixes

---
 .env.example                              |  1 -
 agbenchmark/agent_interface.py            |  7 ++++++-
 agbenchmark/challenges                    |  2 +-
 agbenchmark/conftest.py                   |  1 -
 .../reports/processing/process_report.py  |  6 +++++-
 .../reports/processing/report_types.py    |  2 +-
 agbenchmark/reports/reports.py            | 19 ++++++++++---------
 agbenchmark/start_benchmark.py            |  4 ----
 agbenchmark/utils/challenge.py            | 15 ++++++++++-----
 agent/SuperAGI                            |  2 +-
 pyproject.toml                            |  3 ++-
 11 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/.env.example b/.env.example
index c4b6d26ada4..6d1504095fc 100644
--- a/.env.example
+++ b/.env.example
@@ -1,4 +1,3 @@
 AGENT_NAME=mini-agi
 REPORT_LOCATION="reports/mini-agi"
-MOCK_TEST=False # this is automatically set with the --mock flag
 OPENAI_API_KEY="sk-" # for LLM eval
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 016c6373af2..2c9682cb2d7 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -80,8 +80,13 @@ def get_list_of_file_paths(


 def copy_artifacts_into_workspace(
-    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+    workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
+    if isinstance(workspace, dict):
+        if artifact_folder_name == "artifacts_in":
+            workspace = workspace["input"]
+        else:
+            workspace = workspace["output"]
     file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
     for file_path in file_paths:
         if os.path.isfile(file_path):
diff --git a/agbenchmark/challenges b/agbenchmark/challenges
index b05bd27b8b0..0ec140a61ff 160000
--- a/agbenchmark/challenges
+++ b/agbenchmark/challenges
@@ -1 +1 @@
-Subproject commit b05bd27b8b056843e03c3e9d6056470eaba6e7dd
+Subproject commit 0ec140a61ff6740bb62059e5d1d61495f845f7d2
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index a535eed94a8..4d06e1bd01f 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -76,7 +76,6 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
     yield config["workspace"]
     # teardown after test function completes
     if not config.get("keep_workspace_files", False):
-        print("Emptying workspace")
         for filename in os.listdir(output_path):
             file_path = os.path.join(output_path, filename)
             try:
diff --git a/agbenchmark/reports/processing/process_report.py b/agbenchmark/reports/processing/process_report.py
index 72b5a5c43d3..25f9303a2fb 100644
--- a/agbenchmark/reports/processing/process_report.py
+++ b/agbenchmark/reports/processing/process_report.py
@@ -36,7 +36,11 @@ def get_agent_category(report: Report) -> dict[str, Any]:

     def get_highest_category_difficulty(data: Test) -> None:
         for category in data.category:
-            if category == "interface" or category == "iterate":
+            if (
+                category == "interface"
+                or category == "iterate"
+                or category == "product_advisor"
+            ):
                 continue
             categories.setdefault(category, 0)
             if data.metrics.success:
diff --git a/agbenchmark/reports/processing/report_types.py b/agbenchmark/reports/processing/report_types.py
index a1184dac8ed..30d4346ce0f 100644
--- a/agbenchmark/reports/processing/report_types.py
+++ b/agbenchmark/reports/processing/report_types.py
@@ -43,4 +43,4 @@ class Report(BaseModel):
     benchmark_start_time: str
     metrics: MetricsOverall
     tests: Dict[str, Union[Test, SuiteTest]]
-    config: Dict[str, str]
+    config: Dict[str, str | dict[str, str]]
diff --git a/agbenchmark/reports/reports.py b/agbenchmark/reports/reports.py
index d482532f667..ddf893c751c 100644
--- a/agbenchmark/reports/reports.py
+++ b/agbenchmark/reports/reports.py
@@ -7,7 +7,6 @@
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
-    MOCK_FLAG,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
     SUCCESS_RATE_PATH,
@@ -144,7 +143,6 @@ def update_regression_tests(
 def generate_single_call_report(
     item: Any, call: Any, challenge_data: dict[str, Any]
 ) -> None:
-
     try:
         difficulty = challenge_data["info"]["difficulty"]
     except KeyError:
@@ -205,11 +203,9 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
     if info_details and test_name:
         if run_time:
             cost = None
-            if not MOCK_FLAG and os.environ.get("HELICONE_API_KEY"):
+            if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
                 print("Getting cost from Helicone")
                 cost = get_data_from_helicone(test_name)
-            else:
-                print("Helicone not setup or mock flag set, not getting cost")

             info_details["metrics"]["cost"] = cost

@@ -226,10 +222,15 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:

             info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

-        update_challenges_already_beaten(info_details, test_name)
-        if info_details.get("tests") is not None:
-            for nested_test_name, nested_test_info in info_details["tests"].items():
-                update_challenges_already_beaten(nested_test_info, nested_test_name)
+        if "--mock" not in sys.argv:
+            update_challenges_already_beaten(info_details, test_name)
+            if info_details.get("tests") is not None:
+                for nested_test_name, nested_test_info in info_details[
+                    "tests"
+                ].items():
+                    update_challenges_already_beaten(
+                        nested_test_info, nested_test_name
+                    )

     info_manager.add_test(test_name, info_details)
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index e46e369e393..1c5ea42fafb 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -22,8 +22,6 @@
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
     )

-MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
-
 (
     HOME_DIRECTORY,
@@ -170,8 +168,6 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")

-    os.environ["MOCK_TEST"] = "True" if mock else "False"
-
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py
index d130f48613d..eb9c7019436 100644
--- a/agbenchmark/utils/challenge.py
+++ b/agbenchmark/utils/challenge.py
@@ -10,7 +10,7 @@
 import pytest

 from agbenchmark.agent_api_interface import run_api_agent
-from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
+from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
 from agbenchmark.utils.data_types import ChallengeData, Ground
 from agbenchmark.utils.prompts import (
     END_PROMPT,
@@ -61,7 +61,7 @@ async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
         )
         print(f"\033[1;30mTask: {self.task}\033[0m")

-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             print("Running mock agent")
             copy_artifacts_into_workspace(
                 config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION
@@ -88,7 +88,12 @@ def open_file(workspace: str, filename: str) -> str:
         with open(workspace_dir, "r") as f:
             return f.read()

-    def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
+    def get_artifacts_out(
+        self, workspace: str | dict[str, str], ground: Ground
+    ) -> List[str]:
+        if isinstance(workspace, dict):
+            workspace = workspace["output"]
+
         script_dir = workspace
         files_contents = []

@@ -163,7 +168,7 @@ def scoring(self, config: Dict[str, Any], content: str, ground: Ground) -> float

     def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
         openai.api_key = os.getenv("OPENAI_API_KEY")
-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             return 1.0

         # the validation for this is done in the Eval BaseModel
@@ -190,7 +195,7 @@ def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
         percentage = None

         try:
-            if self.data.task == "" and MOCK_FLAG:
+            if self.data.task == "" and "--mock" in sys.argv:
                 scores = [1.0]
             elif isinstance(self.data.ground, Ground):
                 files_contents = self.get_artifacts_out(
diff --git a/agent/SuperAGI b/agent/SuperAGI
index b0318053b6c..48b21013742 160000
--- a/agent/SuperAGI
+++ b/agent/SuperAGI
@@ -1 +1 @@
-Subproject commit b0318053b6cbe357f2e020fe0f1275a2cb3da767
+Subproject commit 48b2101374264b97dbdfc2c0bb0ae45e769e157d
diff --git a/pyproject.toml b/pyproject.toml
index 1d39c7120a9..25d62cffcca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,7 +60,8 @@ markers = [
     "iterate",
     "adaptability",
     "safety",
-    "content_gen"
+    "content_gen",
+    "product_advisor"
 ]
 filterwarnings = [
     "ignore::pytest.PytestAssertRewriteWarning",
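
Illustration (not part of the patch): with this change a challenge workspace may be configured either as a single path or as an input/output pair, and copy_artifacts_into_workspace resolves the right side before copying. The sketch below mirrors the routing logic added in agent_interface.py; the copy loop is an assumed stand-in for the module's get_list_of_file_paths helper, and all paths are hypothetical.

import os
import shutil


def copy_artifacts_into_workspace(
    workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
) -> None:
    # Routing from the patch: a dict workspace sends "artifacts_in" to the
    # agent's input directory and everything else (e.g. the "artifacts_out"
    # folder that --mock runs copy verbatim) to its output directory.
    if isinstance(workspace, dict):
        if artifact_folder_name == "artifacts_in":
            workspace = workspace["input"]
        else:
            workspace = workspace["output"]
    # Assumed copy step, standing in for get_list_of_file_paths(): copy every
    # file in the challenge's artifact folder into the resolved workspace.
    source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
    if not os.path.isdir(source_dir):
        return
    os.makedirs(workspace, exist_ok=True)
    for name in os.listdir(source_dir):
        file_path = os.path.join(source_dir, name)
        if os.path.isfile(file_path):
            shutil.copy(file_path, workspace)


# Both config shapes are accepted (hypothetical paths):
copy_artifacts_into_workspace("workspace", "artifacts_in", "challenge_dir")
copy_artifacts_into_workspace(
    {"input": "workspace/in", "output": "workspace/out"},
    "artifacts_out",
    "challenge_dir",
)

Either shape satisfies the widened config type Dict[str, str | dict[str, str]] in report_types.py, and get_artifacts_out likewise reads from the "output" side when handed a dict.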