mock flag, workspace io fixes, mark fixes
SilenNaihin committed Aug 11, 2023
1 parent f74a960 commit 1a61c66
Showing 11 changed files with 36 additions and 26 deletions.
.env.example (1 change: 0 additions & 1 deletion)

@@ -1,4 +1,3 @@
 AGENT_NAME=mini-agi
 REPORT_LOCATION="reports/mini-agi"
-MOCK_TEST=False # this is automatically set with the --mock flag
 OPENAI_API_KEY="sk-" # for LLM eval
agbenchmark/agent_interface.py (7 changes: 6 additions & 1 deletion)

@@ -80,8 +80,13 @@ def get_list_of_file_paths(


 def copy_artifacts_into_workspace(
-    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+    workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
+    if isinstance(workspace, dict):
+        if artifact_folder_name == "artifacts_in":
+            workspace = workspace["input"]
+        else:
+            workspace = workspace["output"]
     file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
     for file_path in file_paths:
         if os.path.isfile(file_path):
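With this change the workspace passed around the benchmark may be either a single path or a split input/output pair. A minimal standalone sketch of the resolution rule introduced above (paths here are hypothetical, not from the repo):

# Dict-shaped workspaces read challenge artifacts from "input" and
# collect agent output in "output"; plain strings behave as before.
def resolve_workspace(workspace: str | dict[str, str], artifact_folder_name: str) -> str:
    if isinstance(workspace, dict):
        if artifact_folder_name == "artifacts_in":
            return workspace["input"]
        return workspace["output"]
    return workspace

print(resolve_workspace({"input": "ws/in", "output": "ws/out"}, "artifacts_in"))   # ws/in
print(resolve_workspace({"input": "ws/in", "output": "ws/out"}, "artifacts_out"))  # ws/out
print(resolve_workspace("ws", "artifacts_out"))                                    # ws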
agbenchmark/conftest.py (1 change: 0 additions & 1 deletion)

@@ -76,7 +76,6 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
     yield config["workspace"]
     # teardown after test function completes
     if not config.get("keep_workspace_files", False):
-        print("Emptying workspace")
         for filename in os.listdir(output_path):
             file_path = os.path.join(output_path, filename)
             try:
agbenchmark/reports/processing/process_report.py (6 changes: 5 additions & 1 deletion)

@@ -36,7 +36,11 @@ def get_agent_category(report: Report) -> dict[str, Any]:

     def get_highest_category_difficulty(data: Test) -> None:
         for category in data.category:
-            if category == "interface" or category == "iterate":
+            if (
+                category == "interface"
+                or category == "iterate"
+                or category == "product_advisor"
+            ):
                 continue
             categories.setdefault(category, 0)
             if data.metrics.success:
agbenchmark/reports/processing/report_types.py (2 changes: 1 addition & 1 deletion)

@@ -43,4 +43,4 @@ class Report(BaseModel):
     benchmark_start_time: str
     metrics: MetricsOverall
     tests: Dict[str, Union[Test, SuiteTest]]
-    config: Dict[str, str]
+    config: Dict[str, str | dict[str, str]]
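The widened `config` field lets a report carry the new nested workspace mapping as well as the old flat form. A minimal sketch with pydantic (model cut down to the one field in question; values are hypothetical):

from typing import Dict, Union
from pydantic import BaseModel

class MiniReport(BaseModel):
    config: Dict[str, Union[str, Dict[str, str]]]

MiniReport(config={"workspace": {"input": "ws/in", "output": "ws/out"}})  # nested form
MiniReport(config={"workspace": "ws"})  # flat form still validates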
agbenchmark/reports/reports.py (19 changes: 10 additions & 9 deletions)

@@ -7,7 +7,6 @@
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
-    MOCK_FLAG,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
     SUCCESS_RATE_PATH,

@@ -144,7 +143,6 @@ def update_regression_tests(
 def generate_single_call_report(
     item: Any, call: Any, challenge_data: dict[str, Any]
 ) -> None:
-
     try:
         difficulty = challenge_data["info"]["difficulty"]
     except KeyError:

@@ -205,11 +203,9 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
     if info_details and test_name:
         if run_time:
             cost = None
-            if not MOCK_FLAG and os.environ.get("HELICONE_API_KEY"):
+            if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
                 print("Getting cost from Helicone")
                 cost = get_data_from_helicone(test_name)
-            else:
-                print("Helicone not setup or mock flag set, not getting cost")

             info_details["metrics"]["cost"] = cost

@@ -226,10 +222,15 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:

         info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

-        update_challenges_already_beaten(info_details, test_name)
-        if info_details.get("tests") is not None:
-            for nested_test_name, nested_test_info in info_details["tests"].items():
-                update_challenges_already_beaten(nested_test_info, nested_test_name)
+        if "--mock" not in sys.argv:
+            update_challenges_already_beaten(info_details, test_name)
+            if info_details.get("tests") is not None:
+                for nested_test_name, nested_test_info in info_details[
+                    "tests"
+                ].items():
+                    update_challenges_already_beaten(
+                        nested_test_info, nested_test_name
+                    )

         info_manager.add_test(test_name, info_details)
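The env-var based MOCK_FLAG plumbing is gone; report code now inspects the process's own argv. A sketch of the new cost guard, assuming the launcher forwards --mock into the pytest invocation (the standalone helper name here is illustrative):

import os
import sys

def maybe_get_cost(test_name: str, fetch) -> float | None:
    # Only query Helicone on real runs with a key configured;
    # mock runs simply record a cost of None.
    if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
        return fetch(test_name)  # e.g. get_data_from_helicone
    return None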
agbenchmark/start_benchmark.py (4 changes: 0 additions & 4 deletions)

@@ -22,8 +22,6 @@
 HeliconeLockManager.write_custom_property(
     "benchmark_start_time", BENCHMARK_START_TIME
 )
-MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
-

 (
     HOME_DIRECTORY,

@@ -170,8 +168,6 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")

-    os.environ["MOCK_TEST"] = "True" if mock else "False"
-
     pytest_args = ["-vs"]
    if test:
        print("Running specific test:", test)
agbenchmark/utils/challenge.py (15 changes: 10 additions & 5 deletions)

@@ -10,7 +10,7 @@
 import pytest

 from agbenchmark.agent_api_interface import run_api_agent
-from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
+from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
 from agbenchmark.utils.data_types import ChallengeData, Ground
 from agbenchmark.utils.prompts import (
     END_PROMPT,

@@ -61,7 +61,7 @@ async def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
         )
         print(f"\033[1;30mTask: {self.task}\033[0m")

-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             print("Running mock agent")
             copy_artifacts_into_workspace(
                 config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION

@@ -88,7 +88,12 @@ def open_file(workspace: str, filename: str) -> str:
         with open(workspace_dir, "r") as f:
             return f.read()

-    def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
+    def get_artifacts_out(
+        self, workspace: str | dict[str, str], ground: Ground
+    ) -> List[str]:
+        if isinstance(workspace, dict):
+            workspace = workspace["output"]
+
         script_dir = workspace
         files_contents = []

@@ -163,7 +168,7 @@ def scoring(self, config: Dict[str, Any], content: str, ground: Ground) -> float:

     def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
         openai.api_key = os.getenv("OPENAI_API_KEY")
-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             return 1.0

         # the validation for this is done in the Eval BaseModel

@@ -190,7 +195,7 @@ def get_scores(self, config: Dict[str, Any]) -> dict[str, Any]:
         percentage = None

         try:
-            if self.data.task == "" and MOCK_FLAG:
+            if self.data.task == "" and "--mock" in sys.argv:
                 scores = [1.0]
             elif isinstance(self.data.ground, Ground):
                 files_contents = self.get_artifacts_out(
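The same argv check short-circuits scoring: a mock run copies artifacts_out into the workspace and awards itself full marks without ever calling the LLM. A hedged sketch of that flow (the evaluator callback is hypothetical):

import sys
from typing import Callable

def eval_score(content: str, call_llm: Callable[[str], float]) -> float:
    # Mock runs never spend tokens; real runs defer to the LLM judge.
    if "--mock" in sys.argv:
        return 1.0
    return call_llm(content)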
agent/SuperAGI (2 changes: 1 addition & 1 deletion)

Submodule SuperAGI updated 47 files:
+14 −86 agbenchmark/benchmarks.py
+76 −0 docker-compose-dev.yaml
+3 −3 docker-compose.image.example.yaml
+2 −2 docker-compose.yaml
+18 −5 gui/pages/Content/APM/ApmDashboard.js
+6 −3 gui/pages/Content/Agents/AgentSchedule.js
+1 −1 gui/pages/Content/Agents/Details.js
+14 −2 gui/pages/Content/Marketplace/KnowledgeTemplate.js
+1 −1 gui/pages/Content/Marketplace/MarketKnowledge.js
+1 −1 gui/pages/Content/Marketplace/MarketplacePublic.js
+2 −2 gui/pages/Content/Toolkits/ToolkitWorkspace.js
+4 −3 gui/pages/Dashboard/Content.js
+3 −1 gui/pages/Dashboard/Settings/Model.js
+3 −2 gui/pages/Dashboard/SideBar.js
+16 −4 gui/pages/Dashboard/TopBar.js
+39 −20 gui/pages/_app.css
+18 −4 gui/pages/_app.js
+4 −0 gui/pages/api/DashboardService.js
+3 −0 gui/public/images/discord.svg
+3 −0 gui/public/images/github_white.svg
+ gui/public/images/superagi_logo.png
+ gui/public/images/superagi_logo_beta.png
+1 −1 gui/utils/utils.js
+9 −0 main.py
+4 −3 superagi/controllers/google_oauth.py
+20 −2 superagi/helper/github_helper.py
+37 −3 superagi/helper/s3_helper.py
+13 −2 superagi/helper/twitter_helper.py
+1 −34 superagi/resource_manager/resource_summary.py
+18 −4 superagi/tools/email/send_email_attachment.py
+16 −5 superagi/tools/file/delete_file.py
+5 −7 superagi/tools/github/add_file.py
+5 −3 superagi/tools/github/search_repo.py
+21 −22 superagi/tools/google_calendar/README.md
+4 −4 superagi/tools/image_generation/README.STABLE_DIFFUSION.md
+14 −1 superagi/tools/image_generation/dalle_image_gen.py
+1 −11 superagi/tools/image_generation/stable_diffusion_image_gen.py
+37 −62 superagi/tools/instagram_tool/instagram.py
+1 −1 superagi/tools/resource/query_resource.py
+11 −13 superagi/tools/slack/README.md
+13 −13 superagi/tools/twitter/README.md
+0 −1 superagi/worker.py
+0 −22 tests/unit_tests/helper/test_github_helper.py
+29 −1 tests/unit_tests/helper/test_s3_helper.py
+0 −4 tests/unit_tests/tools/github/test_add_file.py
+32 −4 tests/unit_tests/tools/image_generation/test_dalle_image_gen.py
+57 −97 tests/unit_tests/tools/instagram_tool/test_instagram_tool.py
pyproject.toml (3 changes: 2 additions & 1 deletion)

@@ -60,7 +60,8 @@ markers = [
     "iterate",
     "adaptability",
     "safety",
-    "content_gen"
+    "content_gen",
+    "product_advisor"
 ]
 filterwarnings = [
     "ignore::pytest.PytestAssertRewriteWarning",
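Registering the marker keeps pytest from warning on it, and process_report.py above excludes it from difficulty aggregation alongside "interface" and "iterate". A hypothetical challenge opting into the new marker:

import pytest

@pytest.mark.product_advisor
def test_product_advisor_challenge() -> None:
    # A real challenge would drive the agent and check artifacts_out.
    assert True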
