mock flag, workspace io fixes, mark fixes
SilenNaihin committed Aug 11, 2023
1 parent f74a960 commit 1a61c66
Showing 11 changed files with 36 additions and 26 deletions.
.env.example (1 change: 0 additions & 1 deletion)

@@ -1,4 +1,3 @@
 AGENT_NAME=mini-agi
 REPORT_LOCATION="reports/mini-agi"
-MOCK_TEST=False # this is automatically set with the --mock flag
 OPENAI_API_KEY="sk-" # for LLM eval
agbenchmark/agent_interface.py (7 changes: 6 additions & 1 deletion)

@@ -80,8 +80,13 @@ def get_list_of_file_paths(


 def copy_artifacts_into_workspace(
-    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+    workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
+    if isinstance(workspace, dict):
+        if artifact_folder_name == "artifacts_in":
+            workspace = workspace["input"]
+        else:
+            workspace = workspace["output"]
     file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
     for file_path in file_paths:
         if os.path.isfile(file_path):
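With this change the workspace passed around the benchmark may be either a single path or a split input/output pair. A minimal standalone sketch of the resolution rule introduced above (paths here are hypothetical, not from the repo):

# Dict-shaped workspaces read challenge artifacts from "input" and
# collect agent output in "output"; plain strings behave as before.
def resolve_workspace(workspace: str | dict[str, str], artifact_folder_name: str) -> str:
    if isinstance(workspace, dict):
        if artifact_folder_name == "artifacts_in":
            return workspace["input"]
        return workspace["output"]
    return workspace

print(resolve_workspace({"input": "ws/in", "output": "ws/out"}, "artifacts_in"))   # ws/in
print(resolve_workspace({"input": "ws/in", "output": "ws/out"}, "artifacts_out"))  # ws/out
print(resolve_workspace("ws", "artifacts_out"))                                    # ws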
agbenchmark/conftest.py (1 change: 0 additions & 1 deletion)

@@ -76,7 +76,6 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
     yield config["workspace"]
     # teardown after test function completes
     if not config.get("keep_workspace_files", False):
-        print("Emptying workspace")
         for filename in os.listdir(output_path):
             file_path = os.path.join(output_path, filename)
             try:
agbenchmark/reports/processing/process_report.py (6 changes: 5 additions & 1 deletion)

@@ -36,7 +36,11 @@ def get_agent_category(report: Report) -> dict[str, Any]:

     def get_highest_category_difficulty(data: Test) -> None:
         for category in data.category:
-            if category == "interface" or category == "iterate":
+            if (
+                category == "interface"
+                or category == "iterate"
+                or category == "product_advisor"
+            ):
                 continue
             categories.setdefault(category, 0)
             if data.metrics.success:
agbenchmark/reports/processing/report_types.py (2 changes: 1 addition & 1 deletion)

@@ -43,4 +43,4 @@ class Report(BaseModel):
     benchmark_start_time: str
     metrics: MetricsOverall
     tests: Dict[str, Union[Test, SuiteTest]]
-    config: Dict[str, str]
+    config: Dict[str, str | dict[str, str]]
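The widened `config` field lets a report carry the new nested workspace mapping as well as the old flat form. A minimal sketch with pydantic (model cut down to the one field in question; values are hypothetical):

from typing import Dict, Union
from pydantic import BaseModel

class MiniReport(BaseModel):
    config: Dict[str, Union[str, Dict[str, str]]]

MiniReport(config={"workspace": {"input": "ws/in", "output": "ws/out"}})  # nested form
MiniReport(config={"workspace": "ws"})  # flat form still validates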
agbenchmark/reports/reports.py (19 changes: 10 additions & 9 deletions)

@@ -7,7 +7,6 @@
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
-    MOCK_FLAG,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
     SUCCESS_RATE_PATH,

@@ -144,7 +143,6 @@ def update_regression_tests(
 def generate_single_call_report(
     item: Any, call: Any, challenge_data: dict[str, Any]
 ) -> None:
-
     try:
         difficulty = challenge_data["info"]["difficulty"]
     except KeyError:

@@ -205,11 +203,9 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
     if info_details and test_name:
         if run_time:
             cost = None
-            if not MOCK_FLAG and os.environ.get("HELICONE_API_KEY"):
+            if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
                 print("Getting cost from Helicone")
                 cost = get_data_from_helicone(test_name)
-            else:
-                print("Helicone not setup or mock flag set, not getting cost")

             info_details["metrics"]["cost"] = cost

@@ -226,10 +222,15 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:

         info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

-        update_challenges_already_beaten(info_details, test_name)
-        if info_details.get("tests") is not None:
-            for nested_test_name, nested_test_info in info_details["tests"].items():
-                update_challenges_already_beaten(nested_test_info, nested_test_name)
+        if "--mock" not in sys.argv:
+            update_challenges_already_beaten(info_details, test_name)
+            if info_details.get("tests") is not None:
+                for nested_test_name, nested_test_info in info_details[
+                    "tests"
+                ].items():
+                    update_challenges_already_beaten(
+                        nested_test_info, nested_test_name
+                    )

         info_manager.add_test(test_name, info_details)
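The env-var based MOCK_FLAG plumbing is gone; report code now inspects the process's own argv. A sketch of the new cost guard, assuming the launcher forwards --mock into the pytest invocation (the standalone helper name here is illustrative):

import os
import sys

def maybe_get_cost(test_name: str, fetch) -> float | None:
    # Only query Helicone on real runs with a key configured;
    # mock runs simply record a cost of None.
    if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
        return fetch(test_name)  # e.g. get_data_from_helicone
    return None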
agbenchmark/start_benchmark.py (4 changes: 0 additions & 4 deletions)

@@ -22,8 +22,6 @@
 HeliconeLockManager.write_custom_property(
     "benchmark_start_time", BENCHMARK_START_TIME
 )
-MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
-

 (
     HOME_DIRECTORY,

@@ -170,8 +168,6 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")

-    os.environ["MOCK_TEST"] = "True" if mock else "False"
-
     pytest_args = ["-vs"]
    if test:
        print("Running specific test:", test)
agbenchmark/utils/challenge.py (15 changes: 10 additions & 5 deletions)

@@ -10,7 +10,7 @@
 import pytest

 from agbenchmark.agent_api_interface import run_api_agent
-from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
+from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
 from agbenchmark.utils.data_types import ChallengeData, Ground
 from agbenchmark.utils.prompts import (
     END_PROMPT,

@@ -61,7 +61,7 @@ async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
         )
         print(f"\033[1;30mTask: {self.task}\033[0m")

-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             print("Running mock agent")
             copy_artifacts_into_workspace(
                 config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION

@@ -88,7 +88,12 @@ def open_file(workspace: str, filename: str) -> str:
         with open(workspace_dir, "r") as f:
             return f.read()

-    def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
+    def get_artifacts_out(
+        self, workspace: str | dict[str, str], ground: Ground
+    ) -> List[str]:
+        if isinstance(workspace, dict):
+            workspace = workspace["output"]
+
         script_dir = workspace
         files_contents = []

@@ -163,7 +168,7 @@ def scoring(self, config: Dict[str, Any], content: str, ground: Ground) -> float:

     def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
         openai.api_key = os.getenv("OPENAI_API_KEY")
-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             return 1.0

         # the validation for this is done in the Eval BaseModel

@@ -190,7 +195,7 @@ def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
         percentage = None

         try:
-            if self.data.task == "" and MOCK_FLAG:
+            if self.data.task == "" and "--mock" in sys.argv:
                 scores = [1.0]
             elif isinstance(self.data.ground, Ground):
                 files_contents = self.get_artifacts_out(
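The same argv check short-circuits scoring: a mock run copies artifacts_out into the workspace and awards itself full marks without ever calling the LLM. A hedged sketch of that flow (the evaluator callback is hypothetical):

import sys
from typing import Callable

def eval_score(content: str, call_llm: Callable[[str], float]) -> float:
    # Mock runs never spend tokens; real runs defer to the LLM judge.
    if "--mock" in sys.argv:
        return 1.0
    return call_llm(content)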
agent/SuperAGI (2 changes: 1 addition & 1 deletion)

Submodule SuperAGI updated 47 files:
+14 −86 agbenchmark/benchmarks.py
+76 −0 docker-compose-dev.yaml
+3 −3 docker-compose.image.example.yaml
+2 −2 docker-compose.yaml
+18 −5 gui/pages/Content/APM/ApmDashboard.js
+6 −3 gui/pages/Content/Agents/AgentSchedule.js
+1 −1 gui/pages/Content/Agents/Details.js
+14 −2 gui/pages/Content/Marketplace/KnowledgeTemplate.js
+1 −1 gui/pages/Content/Marketplace/MarketKnowledge.js
+1 −1 gui/pages/Content/Marketplace/MarketplacePublic.js
+2 −2 gui/pages/Content/Toolkits/ToolkitWorkspace.js
+4 −3 gui/pages/Dashboard/Content.js
+3 −1 gui/pages/Dashboard/Settings/Model.js
+3 −2 gui/pages/Dashboard/SideBar.js
+16 −4 gui/pages/Dashboard/TopBar.js
+39 −20 gui/pages/_app.css
+18 −4 gui/pages/_app.js
+4 −0 gui/pages/api/DashboardService.js
+3 −0 gui/public/images/discord.svg
+3 −0 gui/public/images/github_white.svg
+ gui/public/images/superagi_logo.png
+ gui/public/images/superagi_logo_beta.png
+1 −1 gui/utils/utils.js
+9 −0 main.py
+4 −3 superagi/controllers/google_oauth.py
+20 −2 superagi/helper/github_helper.py
+37 −3 superagi/helper/s3_helper.py
+13 −2 superagi/helper/twitter_helper.py
+1 −34 superagi/resource_manager/resource_summary.py
+18 −4 superagi/tools/email/send_email_attachment.py
+16 −5 superagi/tools/file/delete_file.py
+5 −7 superagi/tools/github/add_file.py
+5 −3 superagi/tools/github/search_repo.py
+21 −22 superagi/tools/google_calendar/README.md
+4 −4 superagi/tools/image_generation/README.STABLE_DIFFUSION.md
+14 −1 superagi/tools/image_generation/dalle_image_gen.py
+1 −11 superagi/tools/image_generation/stable_diffusion_image_gen.py
+37 −62 superagi/tools/instagram_tool/instagram.py
+1 −1 superagi/tools/resource/query_resource.py
+11 −13 superagi/tools/slack/README.md
+13 −13 superagi/tools/twitter/README.md
+0 −1 superagi/worker.py
+0 −22 tests/unit_tests/helper/test_github_helper.py
+29 −1 tests/unit_tests/helper/test_s3_helper.py
+0 −4 tests/unit_tests/tools/github/test_add_file.py
+32 −4 tests/unit_tests/tools/image_generation/test_dalle_image_gen.py
+57 −97 tests/unit_tests/tools/instagram_tool/test_instagram_tool.py
pyproject.toml (3 changes: 2 additions & 1 deletion)

@@ -60,7 +60,8 @@ markers = [
     "iterate",
     "adaptability",
     "safety",
-    "content_gen"
+    "content_gen",
+    "product_advisor"
 ]
 filterwarnings = [
     "ignore::pytest.PytestAssertRewriteWarning",
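Registering the marker keeps pytest from warning on it, and process_report.py above excludes it from difficulty aggregation alongside "interface" and "iterate". A hypothetical challenge opting into the new marker:

import pytest

@pytest.mark.product_advisor
def test_product_advisor_challenge() -> None:
    # A real challenge would drive the agent and check artifacts_out.
    assert True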
