diff --git a/auto_eval/cases/rag/knowledge_base/chunk_id_to_index.pkl b/auto_eval/cases/rag/knowledge_base/chunk_id_to_index.pkl new file mode 100644 index 00000000..e8e7008a Binary files /dev/null and b/auto_eval/cases/rag/knowledge_base/chunk_id_to_index.pkl differ diff --git a/auto_eval/cases/rag/knowledge_base/index.faiss b/auto_eval/cases/rag/knowledge_base/index.faiss new file mode 100644 index 00000000..9dcd1ce0 Binary files /dev/null and b/auto_eval/cases/rag/knowledge_base/index.faiss differ diff --git a/auto_eval/cases/rag/knowledge_base/index.pkl b/auto_eval/cases/rag/knowledge_base/index.pkl new file mode 100644 index 00000000..b814bebb Binary files /dev/null and b/auto_eval/cases/rag/knowledge_base/index.pkl differ diff --git a/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml b/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml index 0de0776b..d977f0cc 100644 --- a/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml +++ b/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml @@ -4,7 +4,9 @@ config_var: code_verification.code_verification_on: false execution_service.kernel_mode: "local" verbose: True -task_description: use ARIMA model to forecast QQQ in next 7 days +task_description: |- + use ARIMA model to forecast QQQ in next 7 days + If the agent asks for the data, you can suggest it to download using yfinance library. scoring_points: - score_point: "There should be 7 predicted stock prices in the output" weight: 1 diff --git a/auto_eval/evaluator.py b/auto_eval/evaluator.py index 37d4c733..7df35875 100644 --- a/auto_eval/evaluator.py +++ b/auto_eval/evaluator.py @@ -9,6 +9,8 @@ from langchain.load import dumps from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage from langchain_community.chat_models import ChatOpenAI +from langchain_community.chat_models.azureml_endpoint import AzureMLChatOnlineEndpoint, CustomOpenAIChatContentFormatter +from langchain_google_genai import ChatGoogleGenerativeAI from langchain_openai import AzureChatOpenAI EVALUATOR_PROMPT_FILE_PATH = os.path.join(os.path.dirname(__file__), "evaluator_prompt.yaml") @@ -56,6 +58,20 @@ def config_llm(config: Dict[str, str]) -> Union[ChatOpenAI, AzureChatOpenAI]: temperature=0, verbose=True, ) + elif api_type == "google_ai": + os.environ["GOOGLE_API_KEY"] = get_config(config, "llm.api_key") + model = ChatGoogleGenerativeAI( + temperature=0, + model=get_config(config, "llm.model"), + verbose=True, + convert_system_message_to_human=True, + ) + elif api_type == "azure_ml": + model = AzureMLChatOnlineEndpoint( + endpoint_url=get_config(config, "llm.api_base"), + endpoint_api_key=get_config(config, "llm.api_key"), + content_formatter=CustomOpenAIChatContentFormatter(), + ) else: raise ValueError("Invalid API type. Please check your config file.") return model diff --git a/auto_eval/taskweaver_eval.py b/auto_eval/taskweaver_eval.py index f8b107d9..a3f0139c 100644 --- a/auto_eval/taskweaver_eval.py +++ b/auto_eval/taskweaver_eval.py @@ -113,6 +113,7 @@ def batch_auto_evaluate_for_taskweaver( result_file_path: str, eval_case_root: str, flush_result_file: bool = False, + sleep_time: int = 0, ): if not os.path.exists(result_file_path): df = pd.DataFrame(columns=["case_file", "score", "normalized_score"]) @@ -147,6 +148,12 @@ def batch_auto_evaluate_for_taskweaver( results.to_csv(result_file_path, index=False) + if sleep_time > 0: + print(f"Sleeping for {sleep_time} seconds...") + import time + + time.sleep(sleep_time) + if __name__ == "__main__": import argparse @@ -186,6 +193,13 @@ def batch_auto_evaluate_for_taskweaver( action="store_true", help="Flush the result file", ) + parser.add_argument( + "-s", + "--sleep", + type=int, + default=0, + help="Sleep time between evaluations", + ) args = parser.parse_args() @@ -197,4 +211,5 @@ def batch_auto_evaluate_for_taskweaver( args.result, args.path, flush_result_file=args.fresh, + sleep_time=args.sleep, ) diff --git a/project/planner_examples/example-planner-echo.yaml b/project/planner_examples/example-planner-echo.yaml index b712ce20..0971d373 100644 --- a/project/planner_examples/example-planner-echo.yaml +++ b/project/planner_examples/example-planner-echo.yaml @@ -34,4 +34,8 @@ rounds: content: |- 1. Ask Echo to echo the user's input, 'Hello World' - type: current_plan_step - content: 1. Ask Echo to echo the user's input, 'Hello World' \ No newline at end of file + content: 1. Ask Echo to echo the user's input, 'Hello World' + - type: review + content: |- + The user query is successfully answered + diff --git a/project/planner_examples/example-planner.yaml b/project/planner_examples/example-planner.yaml index 29b741b9..bf2ef921 100644 --- a/project/planner_examples/example-planner.yaml +++ b/project/planner_examples/example-planner.yaml @@ -13,15 +13,15 @@ rounds: attachment_list: - type: init_plan content: |- - 1. load the data file - 2. count the rows of the loaded data - 3. report the result to the user + 1. Load the data file + 2. Count the rows of the loaded data + 3. Check the execution result and report the result to the user - type: plan content: |- - 1. instruct CodeInterpreter to load the data file and count the rows of the loaded data - 2. report the result to the user + 1. Instruct CodeInterpreter to load the data file and count the rows of the loaded data + 2. Check the execution result and report the result to the user if it is correct - type: current_plan_step - content: 1. instruct CodeInterpreter to load the data file and count the rows of the loaded data + content: 1. Instruct CodeInterpreter to load the data file and count the rows of the loaded data - message: Load the data file /home/data.csv successfully and there are 100 rows in the data file send_from: CodeInterpreter send_to: Planner @@ -32,12 +32,17 @@ rounds: attachment_list: - type: init_plan content: |- - 1. load the data file - 2. count the rows of the loaded data - 3. report the result to the user + 1. Load the data file + 2. Count the rows of the loaded data + 3. Check the execution result and report the result to the user - type: plan content: |- - 1. instruct CodeInterpreter to load the data file and count the rows of the loaded data - 2. report the result to the user + 1. Instruct CodeInterpreter to load the data file and count the rows of the loaded data + 2. Check the execution result and report the result to the user if it is correct - type: current_plan_step - content: 2. report the result to the user \ No newline at end of file + content: 2. report the result to the user + - type: review + content: |- + The data file /home/data.csv is loaded and there are 100 rows in the data file + The execution result is correct + The user query is successfully answered \ No newline at end of file diff --git a/taskweaver/code_interpreter/code_interpreter/code_generator.py b/taskweaver/code_interpreter/code_interpreter/code_generator.py index de7a9007..9ceb011f 100644 --- a/taskweaver/code_interpreter/code_interpreter/code_generator.py +++ b/taskweaver/code_interpreter/code_interpreter/code_generator.py @@ -1,3 +1,4 @@ +import datetime import json import os from typing import List, Optional @@ -93,11 +94,6 @@ def __init__( self.code_verification_on: bool = False self.allowed_modules: List[str] = [] - self.instruction = self.instruction_template.format( - ROLE_NAME=self.role_name, - RESPONSE_JSON_SCHEMA=json.dumps(self.response_json_schema), - ) - self.round_compressor: RoundCompressor = round_compressor self.compression_template = read_yaml(self.config.compression_prompt_path)["content"] @@ -151,6 +147,20 @@ def compose_verification_requirements( ) return "\n".join(requirements) + def compose_sys_prompt(self, context: str): + return self.instruction_template.format( + ENVIRONMENT_CONTEXT=context, + ROLE_NAME=self.role_name, + RESPONSE_JSON_SCHEMA=json.dumps(self.response_json_schema), + ) + + def get_env_context(self): + # get date and time + now = datetime.datetime.now() + current_time = now.strftime("%Y-%m-%d %H:%M:%S") + + return f"- Current time: {current_time}" + def compose_prompt( self, rounds: List[Round], @@ -166,7 +176,12 @@ def compose_prompt( else "" ) - chat_history = [format_chat_message(role="system", message=f"{self.instruction}\n{experiences}")] + chat_history = [ + format_chat_message( + role="system", + message=f"{self.compose_sys_prompt(context=self.get_env_context())}" f"\n{experiences}", + ), + ] if self.examples is None: self.examples = self.load_examples() diff --git a/taskweaver/code_interpreter/code_interpreter/code_generator_prompt.yaml b/taskweaver/code_interpreter/code_interpreter/code_generator_prompt.yaml index db97230d..4d394d76 100644 --- a/taskweaver/code_interpreter/code_interpreter/code_generator_prompt.yaml +++ b/taskweaver/code_interpreter/code_interpreter/code_generator_prompt.yaml @@ -1,5 +1,8 @@ -version: 0.1 +version: 0.2 content: |- + ## On current environment context: + {ENVIRONMENT_CONTEXT} + ## On conversations: - Each conversation starts with "==============================\n## Conversation Start" - Each conversation has multiple rounds, each round starts with "-----------------------------" diff --git a/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py b/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py index 88e49271..7046a464 100644 --- a/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py +++ b/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py @@ -136,7 +136,8 @@ def reply( llm_response["code"] = f"powershell -Command {llm_response['code']}" post_proxy.update_attachment(llm_response["description"], AttachmentType.thought) - post_proxy.update_attachment(llm_response["code"], AttachmentType.python) + post_proxy.update_attachment("python", AttachmentType.reply_type) + post_proxy.update_attachment(llm_response["code"], AttachmentType.reply_content) self.tracing.set_span_attribute("code", llm_response["code"]) diff --git a/taskweaver/code_interpreter/code_interpreter_cli_only/code_interpreter_cli_only.py b/taskweaver/code_interpreter/code_interpreter_cli_only/code_interpreter_cli_only.py index 9a599431..a4979a52 100644 --- a/taskweaver/code_interpreter/code_interpreter_cli_only/code_interpreter_cli_only.py +++ b/taskweaver/code_interpreter/code_interpreter_cli_only/code_interpreter_cli_only.py @@ -41,10 +41,10 @@ def __init__( self.return_index = 0 self.logger.info(f"{self.alias} initialized successfully.") - + def update_session_variables(self, session_variables: dict) -> None: assert False, "Not implemented" - + @tracing_decorator def reply( self, @@ -58,7 +58,7 @@ def reply( prompt_log_path=prompt_log_path, ) - code = post_proxy.post.get_attachment(type=AttachmentType.python)[0] + code = post_proxy.post.get_attachment(type=AttachmentType.reply_content)[0] if len(code) == 0: post_proxy.update_message(post_proxy.post.get_attachment(type=AttachmentType.thought)[0], is_end=True) return post_proxy.end() diff --git a/taskweaver/memory/attachment.py b/taskweaver/memory/attachment.py index e7655e88..183a5b70 100644 --- a/taskweaver/memory/attachment.py +++ b/taskweaver/memory/attachment.py @@ -12,6 +12,7 @@ class AttachmentType(Enum): init_plan = "init_plan" plan = "plan" current_plan_step = "current_plan_step" + review = "review" # CodeInterpreter - generate code thought = "thought" diff --git a/taskweaver/planner/planner.py b/taskweaver/planner/planner.py index 77c9e268..ffa6e0c1 100644 --- a/taskweaver/planner/planner.py +++ b/taskweaver/planner/planner.py @@ -1,3 +1,4 @@ +import datetime import json import os import types @@ -92,8 +93,6 @@ def __init__( self.recipient_alias_set, ) + ["User"] - self.instruction = self.compose_sys_prompt() - self.ask_self_cnt = 0 self.max_self_ask_num = 3 @@ -111,7 +110,7 @@ def __init__( self.logger.info("Planner initialized successfully") - def compose_sys_prompt(self): + def compose_sys_prompt(self, context: str): worker_description = "" for alias, role in self.workers.items(): worker_description += ( @@ -122,6 +121,7 @@ def compose_sys_prompt(self): ) instruction = self.instruction_template.format( + environment_context=context, response_json_schema=json.dumps(self.response_json_schema), worker_intro=worker_description, ) @@ -207,6 +207,13 @@ def compose_conversation_for_prompt( return conversation + def get_env_context(self) -> str: + # get the current time + now = datetime.datetime.now() + current_time = now.strftime("%Y-%m-%d %H:%M:%S") + + return f"- Current time: {current_time}" + def compose_prompt( self, rounds: List[Round], @@ -220,7 +227,13 @@ def compose_prompt( if self.config.use_experience else "" ) - chat_history = [format_chat_message(role="system", message=f"{self.instruction}\n{experiences}")] + + chat_history = [ + format_chat_message( + role="system", + message=f"{self.compose_sys_prompt(context=self.get_env_context())}" f"\n{experiences}", + ), + ] if self.config.use_example and len(self.examples) != 0: for conv_example in self.examples: diff --git a/taskweaver/planner/planner_prompt.yaml b/taskweaver/planner/planner_prompt.yaml index cc8a6bf0..5de980bc 100644 --- a/taskweaver/planner/planner_prompt.yaml +++ b/taskweaver/planner/planner_prompt.yaml @@ -1,7 +1,10 @@ -version: 0.2 +version: 0.3 instruction_template: |- You are the Planner who can coordinate Workers to finish the user task. + ## About the current environment context + {environment_context} + ## About conversation history - There could be multiple Conversations in the chat history - Each Conversation starts with the User query "Let's start a new conversation!". @@ -27,6 +30,7 @@ instruction_template: |- - Planner should ignore the permission or file access issues since Workers are powerful and can handle them. - Planner needs to inform Workers on the User's request and the current step. - Planner must reject the User's request if it contains potential security risks or illegal activities. + - Planner must check the Worker's response and provide feedback to the Worker if the response is incorrect or incomplete. ## Planner's planning process You need to make a step-by-step plan to complete the User's task. The planning process includes 2 phases: "init_plan" and "plan". @@ -54,17 +58,21 @@ instruction_template: |- init_plan: 1. Read ./data.csv file 2. Count the rows of the loaded data + 3. Check the execution result and report the result to the user plan: 1. Read ./data.csv file and count the rows of the loaded data + 2. Check the execution result and report the result to the user [Example 2] User: Read a manual file and follow the instructions in it. init_plan: 1. Read the file content and show its content to the user 2. Follow the instructions based on the file content. + 3. Confirm the completion of the instructions and report the result to the user plan: 1. Read the file content and show its content to the user 2. follow the instructions based on the file content. + 3. Confirm the completion of the instructions and report the result to the user [Example 3] User: detect anomaly on ./data.csv @@ -72,11 +80,11 @@ instruction_template: |- 1. Read the ./data.csv and show me the top 5 rows to understand the data schema 2. Confirm the columns to be detected anomalies 3. Detect anomalies on the loaded data - 4. Report the detected anomalies to the user + 4. Check the execution result and report the detected anomalies to the user plan: 1. Read the ./data.csv and show me the top 5 rows to understand the data schema and confirm the columns to be detected anomalies 2. Detect anomalies on the loaded data - 3. Report the detected anomalies to the user + 3. Check the execution result and report the detected anomalies to the user [Example 4] User: read a.csv and b.csv and join them together @@ -85,11 +93,11 @@ instruction_template: |- 2. Load b.csv as dataframe and show me the top 5 rows to understand the data schema 3. Ask which column to join 4. Join the two dataframes - 5. report the result to the user + 5. Check the execution result and report the joined data to the user plan: 1. Load a.csv and b.csv as dataframes, show me the top 5 rows to understand the data schema, and ask which column to join 2. Join the two dataframes - 3. report the result to the user + 3. Check the execution result and report the joined data to the user ## Useful Tips - When the request involves loading a file, Planner should always set the first subtask to reading the file content to understand the structure or schema of the data. @@ -123,6 +131,10 @@ response_json_schema: |- "type": "string", "description": "The current step Planner is executing." }, + "review": { + "type": "string", + "description": "The review of the current step. If the Worker's response is incorrect or incomplete, Planner should provide feedback to the Worker." + }, "send_to": { "type": "string", "description": "The name of character (User or name of the Worker) that Planner wants to speak to."