Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into propose-max-3-actions-in-weekly-analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
rjambrecic committed May 21, 2024
2 parents aad28bd + dbdac36 commit c3727d2
Show file tree
Hide file tree
Showing 21 changed files with 395 additions and 91 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
url,success_rate,avg_time
https://faststream.airt.ai,1.0,784.6
https://www.disneystore.eu,1.0,745.62
https://www.ikea.com/gb/en/,1.0,904.52
Total,1.0,811.58

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
url,success_rate,avg_time
https://faststream.airt.ai,1.0,911.87
https://www.disneystore.eu,1.0,872.92
https://www.ikea.com/gb/en/,1.0,2492.2
Total,1.0,1425.66
61 changes: 61 additions & 0 deletions benchmarking/end2end-benchmark-task-list-2024-05-20T23:25:19.csv

Large diffs are not rendered by default.

66 changes: 41 additions & 25 deletions captn/captn_agents/backend/benchmarking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def lock_file(path: Path) -> Iterator[None]:
lock_path.unlink()


COMMON_COLUMNS = ["execution_time", "status", "success", "output"]
COMMON_COLUMNS = ["execution_time", "status", "success", "output", "retries"]


def _add_common_columns_and_save(
Expand Down Expand Up @@ -219,13 +219,17 @@ def run_test(
) -> Dict[str, Any]:
try:
time_start = time.time()
output = benchmark(**kwargs)
success = True
output, retry_from_scratch_counters = benchmark(**kwargs)
if retry_from_scratch_counters == 0:
success = "Success"
else:
success = "Success with retry"
except Exception as e:
traceback.print_stack()
traceback.print_exc()
output = str(e)
success = False
success = "Failed"
retry_from_scratch_counters = -1
finally:
total_time = time.time() - time_start
status = "DONE"
Expand All @@ -235,6 +239,7 @@ def run_test(
"status": status,
"success": success,
"output": output,
"retries": retry_from_scratch_counters,
}


Expand All @@ -250,31 +255,42 @@ def get_random_nan_index(xs: pd.Series) -> Any:

def create_ag_report(df: pd.DataFrame, groupby_list: List[str]) -> pd.DataFrame:
    """Build an aggregate benchmark report grouped by ``groupby_list``.

    For each group, computes the percentage of runs whose ``success`` column
    is ``"Success"``, ``"Success with retry"`` or ``"Failed"``, plus the mean
    ``execution_time``. A final ``Total`` row aggregates over all rows.

    Args:
        df: Benchmark results; must contain ``success`` and
            ``execution_time`` columns plus every column in ``groupby_list``.
        groupby_list: Non-empty list of column names to group by.

    Returns:
        A DataFrame with one row per group and a trailing ``Total`` row,
        all numeric columns rounded to 2 decimal places.
    """
    # Rows with NaN success were never run; exclude them from every statistic.
    df = df.dropna(subset=["success"])

    result = (
        df.groupby(groupby_list)
        .agg(
            success_percentage=(
                "success",
                lambda x: (x == "Success").mean() * 100,
            ),
            success_with_retry_percentage=(
                "success",
                lambda x: (x == "Success with retry").mean() * 100,
            ),
            failed_percentage=("success", lambda x: (x == "Failed").mean() * 100),
            avg_time=("execution_time", "mean"),
        )
        .reset_index()
    )

    # Aggregate row over ALL records (not a mean of the per-group rows, so
    # unevenly sized groups are weighted correctly).
    total_row = {
        "success_percentage": df["success"].eq("Success").mean() * 100,
        "success_with_retry_percentage": df["success"]
        .eq("Success with retry")
        .mean()
        * 100,
        "failed_percentage": df["success"].eq("Failed").mean() * 100,
        "avg_time": df["execution_time"].mean(),
    }
    for col in groupby_list:
        total_row[col] = None
    # Label the Total row in the FIRST groupby column instead of hardcoding
    # "url", so reports grouped by other keys also get a labelled Total row.
    total_row[groupby_list[0]] = "Total"
    result.loc[len(result)] = total_row

    return result.reset_index(drop=True).round(2)


GROUP_BY_DICT = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def benchmark_brief_creation(
url: str,
team_name: str,
llm: str = Models.gpt3_5,
) -> str:
) -> Tuple[str, int]:
config_list = get_config_list(llm)

user_id = 123
Expand Down Expand Up @@ -166,7 +166,9 @@ def benchmark_brief_creation(
"task" in delegate_task_function_sugestion_function["arguments"]
) # nosec: [B101]

return delegate_task_function_sugestion_function["arguments"] # type: ignore[no-any-return]
return delegate_task_function_sugestion_function[
"arguments"
], team.retry_from_scratch_counter
finally:
poped_team = Team.pop_team(user_id=user_id, conv_id=conv_id)
assert isinstance(poped_team, Team) # nosec: [B101]
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def continue_conversation_until_finished(
def benchmark_campaign_creation(
url: str,
llm: str = Models.gpt4,
) -> str:
) -> Tuple[str, int]:
try:
task = URL_TASK_DICT[url]
config_list = get_config_list(llm)
Expand Down Expand Up @@ -215,7 +215,7 @@ def benchmark_campaign_creation(
mock_create_ad_group_ad=mock_create_ad_group_ad,
mock_create_ad_group_keyword=mock_create_ad_group_keyword,
mock_create_campaign=mock_create_campaign,
)
), campaign_creation_team.retry_from_scratch_counter
finally:
user_id, conv_id = campaign_creation_team.name.split("_")[-2:]
success = Team.pop_team(user_id=int(user_id), conv_id=int(conv_id))
Expand Down
28 changes: 17 additions & 11 deletions captn/captn_agents/backend/benchmarking/end2end.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from contextlib import contextmanager
from tempfile import TemporaryDirectory
from typing import Any, Iterator
from typing import Any, Iterator, Tuple
from unittest.mock import patch

from autogen.cache import Cache
Expand Down Expand Up @@ -49,13 +49,13 @@ def _patch_brief_creation_team_vars(
def benchmark_end2end(
url: str,
llm: str = Models.gpt4,
) -> str:
) -> Tuple[str, int]:
config_list = get_config_list(llm)

user_id = 123
conv_id = 234
task = _get_task(url)
team = BriefCreationTeam(
brief_creation_team = BriefCreationTeam(
task=task, user_id=user_id, conv_id=conv_id, config_list=config_list
)

Expand All @@ -66,15 +66,17 @@ def benchmark_end2end(
with TemporaryDirectory() as cache_dir:
with Cache.disk(cache_path_root=cache_dir) as cache:
with _patch_brief_creation_team_vars(
team=team, client_system_message=client_system_message, cache=cache
team=brief_creation_team,
client_system_message=client_system_message,
cache=cache,
) as mock_change_the_team_and_start_new_chat:
with _patch_campaign_creation_team_vars() as (
mock_create_ad_group,
mock_create_ad_group_ad,
mock_create_ad_group_keyword,
mock_create_campaign,
):
team.initiate_chat(cache=cache)
brief_creation_team.initiate_chat(cache=cache)
continue_conversation_until_finished(
user_id=user_id,
conv_id=conv_id,
Expand All @@ -94,12 +96,16 @@ def benchmark_end2end(
)
assert isinstance(campaign_creation_team, CampaignCreationTeam) # nosec: [B101]

return run_assertions_and_return_last_message(
campaign_creation_team=campaign_creation_team,
mock_create_ad_group=mock_create_ad_group,
mock_create_ad_group_ad=mock_create_ad_group_ad,
mock_create_ad_group_keyword=mock_create_ad_group_keyword,
mock_create_campaign=mock_create_campaign,
return (
run_assertions_and_return_last_message(
campaign_creation_team=campaign_creation_team,
mock_create_ad_group=mock_create_ad_group,
mock_create_ad_group_ad=mock_create_ad_group_ad,
mock_create_ad_group_keyword=mock_create_ad_group_keyword,
mock_create_campaign=mock_create_campaign,
),
brief_creation_team.retry_from_scratch_counter
+ campaign_creation_team.retry_from_scratch_counter,
)
finally:
poped_team = Team.pop_team(user_id=user_id, conv_id=conv_id)
Expand Down
7 changes: 5 additions & 2 deletions captn/captn_agents/backend/benchmarking/websurfer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Tuple

from ..tools._functions import (
get_get_info_from_the_web_page,
get_llm_config_gpt_3_5,
Expand All @@ -17,7 +19,7 @@ def benchmark_websurfer(
navigator_llm: str = Models.gpt4,
timestamp: str = "2024-01-01T00:00:0",
introduce_give_up_after: int = 7,
) -> str:
) -> Tuple[str, int]:
llm_configs = {
Models.gpt3_5.value: get_llm_config_gpt_3_5(),
Models.gpt4.value: get_llm_config_gpt_4(),
Expand All @@ -42,4 +44,5 @@ def benchmark_websurfer(
):
raise AssertionError(last_message)

return last_message
# TODO: Always return 0 as a retry counter
return last_message, 0
6 changes: 2 additions & 4 deletions captn/captn_agents/backend/teams/_brief_creation_team.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,6 @@ def __init__(
temperature: float = 0.2,
config_list: Optional[List[Dict[str, str]]] = None,
):
self.task = task
self.initial_brief = task

recommended_modifications_and_answer_list: List[
Tuple[Dict[str, Any], Optional[str]]
] = []
Expand All @@ -58,6 +55,7 @@ def __init__(
user_id=user_id,
conv_id=conv_id,
roles=roles,
task=task,
function_map=function_map,
work_dir=work_dir,
max_round=max_round,
Expand Down Expand Up @@ -85,7 +83,7 @@ def _add_tools(self) -> None:
self.toolbox = create_brief_creation_team_toolbox(
user_id=self.user_id,
conv_id=self.conv_id,
initial_brief=self.initial_brief,
initial_brief=self.task,
)
for agent in self.members:
if agent != self.user_proxy:
Expand Down
9 changes: 7 additions & 2 deletions captn/captn_agents/backend/teams/_campaign_creation_team.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,6 @@ def __init__(
temperature: float = 0.2,
config_list: Optional[List[Dict[str, str]]] = None,
):
self.task = task

recommended_modifications_and_answer_list: List[
Tuple[Dict[str, Any], Optional[str]]
] = []
Expand All @@ -113,6 +111,7 @@ def __init__(
user_id=user_id,
conv_id=conv_id,
roles=roles,
task=task,
function_map=function_map,
work_dir=work_dir,
max_round=max_round,
Expand Down Expand Up @@ -319,3 +318,9 @@ def get_brief_template(cls) -> str:
Now Let's get all the information from the clients web page and create a detailed plan for the campaign.
"""

def initiate_chat(self, **kwargs: Any) -> None:
    """Start the team chat, surfacing modifications already applied.

    If the shared toolbox context records changes that were already made
    (presumably from an earlier run/retry of the team — confirm against the
    retry-from-scratch flow), append them to the initial message so the
    agents do not repeat that work, then clear the record and delegate to
    the base-class implementation.
    """
    if self.toolbox._context.changes_made:  # type: ignore[union-attr]
        # Inform the agents of prior work, then reset so a later restart
        # does not re-announce the same changes.
        self.initial_message += f"\nThe following modifications have already been done: {self.toolbox._context.changes_made}\n"  # type: ignore[union-attr]
        self.toolbox._context.changes_made = ""  # type: ignore[union-attr]
    super().initiate_chat(**kwargs)
2 changes: 1 addition & 1 deletion captn/captn_agents/backend/teams/_google_ads_team.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def __init__(
user_id=user_id,
conv_id=conv_id,
roles=roles,
task=task,
function_map=function_map,
work_dir=work_dir,
max_round=max_round,
Expand All @@ -89,7 +90,6 @@ def __init__(
use_user_proxy=True,
)
self.conv_id = conv_id
self.task = task
self.llm_config = GoogleAdsTeam._get_llm_config(
seed=seed, temperature=temperature
)
Expand Down
Loading

0 comments on commit c3727d2

Please sign in to comment.