Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into propose-max-3-actions-in-weekly-analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
rjambrecic committed May 21, 2024
2 parents aad28bd + dbdac36 commit c3727d2
Show file tree
Hide file tree
Showing 21 changed files with 395 additions and 91 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
url,success_rate,avg_time
https://faststream.airt.ai,1.0,784.6
https://www.disneystore.eu,1.0,745.62
https://www.ikea.com/gb/en/,1.0,904.52
Total,1.0,811.58

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
url,success_rate,avg_time
https://faststream.airt.ai,1.0,911.87
https://www.disneystore.eu,1.0,872.92
https://www.ikea.com/gb/en/,1.0,2492.2
Total,1.0,1425.66
61 changes: 61 additions & 0 deletions benchmarking/end2end-benchmark-task-list-2024-05-20T23:25:19.csv

Large diffs are not rendered by default.

66 changes: 41 additions & 25 deletions captn/captn_agents/backend/benchmarking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def lock_file(path: Path) -> Iterator[None]:
lock_path.unlink()


COMMON_COLUMNS = ["execution_time", "status", "success", "output"]
COMMON_COLUMNS = ["execution_time", "status", "success", "output", "retries"]


def _add_common_columns_and_save(
Expand Down Expand Up @@ -219,13 +219,17 @@ def run_test(
) -> Dict[str, Any]:
try:
time_start = time.time()
output = benchmark(**kwargs)
success = True
output, retry_from_scratch_counters = benchmark(**kwargs)
if retry_from_scratch_counters == 0:
success = "Success"
else:
success = "Success with retry"
except Exception as e:
traceback.print_stack()
traceback.print_exc()
output = str(e)
success = False
success = "Failed"
retry_from_scratch_counters = -1
finally:
total_time = time.time() - time_start
status = "DONE"
Expand All @@ -235,6 +239,7 @@ def run_test(
"status": status,
"success": success,
"output": output,
"retries": retry_from_scratch_counters,
}


Expand All @@ -250,31 +255,42 @@ def get_random_nan_index(xs: pd.Series) -> Any:

def create_ag_report(df: pd.DataFrame, groupby_list: List[str]) -> pd.DataFrame:
    """Build an aggregate benchmark report grouped by ``groupby_list``.

    For each group, computes the percentage of runs whose ``success`` column
    is ``"Success"``, ``"Success with retry"`` or ``"Failed"``, plus the mean
    ``execution_time``. A final ``Total`` row aggregates over all rows.

    Args:
        df: Benchmark results; must contain ``success`` and
            ``execution_time`` columns plus every column in ``groupby_list``.
        groupby_list: Non-empty list of column names to group by.

    Returns:
        A DataFrame with one row per group and a trailing ``Total`` row,
        all numeric columns rounded to 2 decimal places.
    """
    # Rows with NaN success were never run; exclude them from every statistic.
    df = df.dropna(subset=["success"])

    result = (
        df.groupby(groupby_list)
        .agg(
            success_percentage=(
                "success",
                lambda x: (x == "Success").mean() * 100,
            ),
            success_with_retry_percentage=(
                "success",
                lambda x: (x == "Success with retry").mean() * 100,
            ),
            failed_percentage=("success", lambda x: (x == "Failed").mean() * 100),
            avg_time=("execution_time", "mean"),
        )
        .reset_index()
    )

    # Aggregate row over ALL records (not a mean of the per-group rows, so
    # unevenly sized groups are weighted correctly).
    total_row = {
        "success_percentage": df["success"].eq("Success").mean() * 100,
        "success_with_retry_percentage": df["success"]
        .eq("Success with retry")
        .mean()
        * 100,
        "failed_percentage": df["success"].eq("Failed").mean() * 100,
        "avg_time": df["execution_time"].mean(),
    }
    for col in groupby_list:
        total_row[col] = None
    # Label the Total row in the FIRST groupby column instead of hardcoding
    # "url", so reports grouped by other keys also get a labelled Total row.
    total_row[groupby_list[0]] = "Total"
    result.loc[len(result)] = total_row

    return result.reset_index(drop=True).round(2)


GROUP_BY_DICT = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def benchmark_brief_creation(
url: str,
team_name: str,
llm: str = Models.gpt3_5,
) -> str:
) -> Tuple[str, int]:
config_list = get_config_list(llm)

user_id = 123
Expand Down Expand Up @@ -166,7 +166,9 @@ def benchmark_brief_creation(
"task" in delegate_task_function_sugestion_function["arguments"]
) # nosec: [B101]

return delegate_task_function_sugestion_function["arguments"] # type: ignore[no-any-return]
return delegate_task_function_sugestion_function[
"arguments"
], team.retry_from_scratch_counter
finally:
poped_team = Team.pop_team(user_id=user_id, conv_id=conv_id)
assert isinstance(poped_team, Team) # nosec: [B101]
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def continue_conversation_until_finished(
def benchmark_campaign_creation(
url: str,
llm: str = Models.gpt4,
) -> str:
) -> Tuple[str, int]:
try:
task = URL_TASK_DICT[url]
config_list = get_config_list(llm)
Expand Down Expand Up @@ -215,7 +215,7 @@ def benchmark_campaign_creation(
mock_create_ad_group_ad=mock_create_ad_group_ad,
mock_create_ad_group_keyword=mock_create_ad_group_keyword,
mock_create_campaign=mock_create_campaign,
)
), campaign_creation_team.retry_from_scratch_counter
finally:
user_id, conv_id = campaign_creation_team.name.split("_")[-2:]
success = Team.pop_team(user_id=int(user_id), conv_id=int(conv_id))
Expand Down
28 changes: 17 additions & 11 deletions captn/captn_agents/backend/benchmarking/end2end.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from contextlib import contextmanager
from tempfile import TemporaryDirectory
from typing import Any, Iterator
from typing import Any, Iterator, Tuple
from unittest.mock import patch

from autogen.cache import Cache
Expand Down Expand Up @@ -49,13 +49,13 @@ def _patch_brief_creation_team_vars(
def benchmark_end2end(
url: str,
llm: str = Models.gpt4,
) -> str:
) -> Tuple[str, int]:
config_list = get_config_list(llm)

user_id = 123
conv_id = 234
task = _get_task(url)
team = BriefCreationTeam(
brief_creation_team = BriefCreationTeam(
task=task, user_id=user_id, conv_id=conv_id, config_list=config_list
)

Expand All @@ -66,15 +66,17 @@ def benchmark_end2end(
with TemporaryDirectory() as cache_dir:
with Cache.disk(cache_path_root=cache_dir) as cache:
with _patch_brief_creation_team_vars(
team=team, client_system_message=client_system_message, cache=cache
team=brief_creation_team,
client_system_message=client_system_message,
cache=cache,
) as mock_change_the_team_and_start_new_chat:
with _patch_campaign_creation_team_vars() as (
mock_create_ad_group,
mock_create_ad_group_ad,
mock_create_ad_group_keyword,
mock_create_campaign,
):
team.initiate_chat(cache=cache)
brief_creation_team.initiate_chat(cache=cache)
continue_conversation_until_finished(
user_id=user_id,
conv_id=conv_id,
Expand All @@ -94,12 +96,16 @@ def benchmark_end2end(
)
assert isinstance(campaign_creation_team, CampaignCreationTeam) # nosec: [B101]

return run_assertions_and_return_last_message(
campaign_creation_team=campaign_creation_team,
mock_create_ad_group=mock_create_ad_group,
mock_create_ad_group_ad=mock_create_ad_group_ad,
mock_create_ad_group_keyword=mock_create_ad_group_keyword,
mock_create_campaign=mock_create_campaign,
return (
run_assertions_and_return_last_message(
campaign_creation_team=campaign_creation_team,
mock_create_ad_group=mock_create_ad_group,
mock_create_ad_group_ad=mock_create_ad_group_ad,
mock_create_ad_group_keyword=mock_create_ad_group_keyword,
mock_create_campaign=mock_create_campaign,
),
brief_creation_team.retry_from_scratch_counter
+ campaign_creation_team.retry_from_scratch_counter,
)
finally:
poped_team = Team.pop_team(user_id=user_id, conv_id=conv_id)
Expand Down
7 changes: 5 additions & 2 deletions captn/captn_agents/backend/benchmarking/websurfer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Tuple

from ..tools._functions import (
get_get_info_from_the_web_page,
get_llm_config_gpt_3_5,
Expand All @@ -17,7 +19,7 @@ def benchmark_websurfer(
navigator_llm: str = Models.gpt4,
timestamp: str = "2024-01-01T00:00:0",
introduce_give_up_after: int = 7,
) -> str:
) -> Tuple[str, int]:
llm_configs = {
Models.gpt3_5.value: get_llm_config_gpt_3_5(),
Models.gpt4.value: get_llm_config_gpt_4(),
Expand All @@ -42,4 +44,5 @@ def benchmark_websurfer(
):
raise AssertionError(last_message)

return last_message
# TODO: Always return 0 as a retry counter
return last_message, 0
6 changes: 2 additions & 4 deletions captn/captn_agents/backend/teams/_brief_creation_team.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,6 @@ def __init__(
temperature: float = 0.2,
config_list: Optional[List[Dict[str, str]]] = None,
):
self.task = task
self.initial_brief = task

recommended_modifications_and_answer_list: List[
Tuple[Dict[str, Any], Optional[str]]
] = []
Expand All @@ -58,6 +55,7 @@ def __init__(
user_id=user_id,
conv_id=conv_id,
roles=roles,
task=task,
function_map=function_map,
work_dir=work_dir,
max_round=max_round,
Expand Down Expand Up @@ -85,7 +83,7 @@ def _add_tools(self) -> None:
self.toolbox = create_brief_creation_team_toolbox(
user_id=self.user_id,
conv_id=self.conv_id,
initial_brief=self.initial_brief,
initial_brief=self.task,
)
for agent in self.members:
if agent != self.user_proxy:
Expand Down
9 changes: 7 additions & 2 deletions captn/captn_agents/backend/teams/_campaign_creation_team.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,6 @@ def __init__(
temperature: float = 0.2,
config_list: Optional[List[Dict[str, str]]] = None,
):
self.task = task

recommended_modifications_and_answer_list: List[
Tuple[Dict[str, Any], Optional[str]]
] = []
Expand All @@ -113,6 +111,7 @@ def __init__(
user_id=user_id,
conv_id=conv_id,
roles=roles,
task=task,
function_map=function_map,
work_dir=work_dir,
max_round=max_round,
Expand Down Expand Up @@ -319,3 +318,9 @@ def get_brief_template(cls) -> str:
Now Let's get all the information from the clients web page and create a detailed plan for the campaign.
"""

def initiate_chat(self, **kwargs: Any) -> None:
    """Start the team chat, surfacing modifications already applied.

    If the shared toolbox context records changes that were already made
    (presumably from an earlier run/retry of the team — confirm against the
    retry-from-scratch flow), append them to the initial message so the
    agents do not repeat that work, then clear the record and delegate to
    the base-class implementation.
    """
    if self.toolbox._context.changes_made:  # type: ignore[union-attr]
        # Inform the agents of prior work, then reset so a later restart
        # does not re-announce the same changes.
        self.initial_message += f"\nThe following modifications have already been done: {self.toolbox._context.changes_made}\n"  # type: ignore[union-attr]
        self.toolbox._context.changes_made = ""  # type: ignore[union-attr]
    super().initiate_chat(**kwargs)
2 changes: 1 addition & 1 deletion captn/captn_agents/backend/teams/_google_ads_team.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def __init__(
user_id=user_id,
conv_id=conv_id,
roles=roles,
task=task,
function_map=function_map,
work_dir=work_dir,
max_round=max_round,
Expand All @@ -89,7 +90,6 @@ def __init__(
use_user_proxy=True,
)
self.conv_id = conv_id
self.task = task
self.llm_config = GoogleAdsTeam._get_llm_config(
seed=seed, temperature=temperature
)
Expand Down
Loading

0 comments on commit c3727d2

Please sign in to comment.