From cb7d7b2e3bcb96ec6aced707bf367b8c4424cce9 Mon Sep 17 00:00:00 2001 From: Hugo Saporetti Junior Date: Thu, 26 Sep 2024 17:42:50 -0300 Subject: [PATCH] Refactorings, bugfixes and parse improvements --- src/demo/features/rag/x_refs_demo.py | 2 +- src/main/askai/core/enums/acc_color.py | 106 ++++++++++++++++ src/main/askai/core/enums/acc_response.py | 117 ++++++------------ .../core/features/processors/task_splitter.py | 8 +- .../askai/core/features/router/__init__.py | 2 +- .../{task_accuracy.py => evaluation.py} | 36 +++--- .../askai/core/features/router/task_agent.py | 6 +- .../askai/core/features/tools/terminal.py | 2 +- src/main/askai/core/features/tools/vision.py | 2 +- src/main/askai/core/model/action_plan.py | 52 ++++---- src/main/askai/core/support/utilities.py | 49 ++++++++ .../askai/resources/prompts/acc-report.txt | 8 ++ src/main/askai/resources/prompts/accuracy.txt | 95 -------------- .../askai/resources/prompts/evaluation.txt | 110 +++++++++++++++- .../askai/resources/prompts/task-splitter.txt | 10 +- src/test/core/model/__init__.py | 0 src/test/core/model/test_action_plan.py | 66 ++++++++++ src/test/fixtures/__init__.py | 0 src/test/fixtures/action_plan.py | 64 ++++++++++ .../resources/llm-responses/task-splitter.txt | 57 +++++++++ 20 files changed, 550 insertions(+), 242 deletions(-) create mode 100644 src/main/askai/core/enums/acc_color.py rename src/main/askai/core/features/router/{task_accuracy.py => evaluation.py} (81%) create mode 100644 src/main/askai/resources/prompts/acc-report.txt delete mode 100644 src/main/askai/resources/prompts/accuracy.txt create mode 100644 src/test/core/model/__init__.py create mode 100644 src/test/core/model/test_action_plan.py create mode 100644 src/test/fixtures/__init__.py create mode 100644 src/test/fixtures/action_plan.py create mode 100644 src/test/resources/llm-responses/task-splitter.txt diff --git a/src/demo/features/rag/x_refs_demo.py b/src/demo/features/rag/x_refs_demo.py index c1b04fc6..16ebdd0a 100644 --- a/src/demo/features/rag/x_refs_demo.py +++ b/src/demo/features/rag/x_refs_demo.py @@ -1,4 +1,4 @@ -from askai.core.features.router.task_accuracy import resolve_x_refs +from askai.core.features.router.evaluation import resolve_x_refs from askai.core.support.shared_instances import shared from askai.core.support.utilities import display_text from utils import get_resource, init_context diff --git a/src/main/askai/core/enums/acc_color.py b/src/main/askai/core/enums/acc_color.py new file mode 100644 index 00000000..cf730a36 --- /dev/null +++ b/src/main/askai/core/enums/acc_color.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" + @project: HsPyLib-AskAI + @package: askai.core.enums.acc_response + @file: acc_color.py + @created: thu, 26 Sep 2024 + @author: Hugo Saporetti Junior + @site: https://github.com/yorevs/askai + @license: MIT - Please refer to + + Copyright (c) 2024, HomeSetup +""" +from typing import Literal, TypeAlias + +from hspylib.core.enums.enumeration import Enumeration + +AccuracyColors: TypeAlias = Literal["Blue", "Green", "Yellow", "Orange", "Red"] + + +class AccColor(Enumeration): + """TODO""" + + # fmt: off + + INTERRUPT = 'Black', -1 + + EXCELLENT = 'Blue', 0 + + GOOD = 'Green', 1 + + MODERATE = 'Yellow', 2 + + INCOMPLETE = 'Orange', 3 + + BAD = 'Red', 4 + + def __init__(self, color: AccuracyColors, weight: int): + self._color: AccuracyColors = color + self._weight: int = weight + + def __eq__(self, other: "AccColor") -> bool: + return self.val == other.val + + def __lt__(self, other) 
-> bool: + return self.val < other.val + + def __le__(self, other) -> bool: + return self.val <= other.val + + def __gt__(self, other) -> bool: + return self.val > other.val + + def __ge__(self, other) -> bool: + return self.val >= other.val + + def __str__(self) -> str: + return self.color + + @classmethod + def of_color(cls, color_str: AccuracyColors) -> 'AccColor': + """Create an AccResponse instance based on status and optional reasoning. + :param color_str: The color as a string. + :return: An instance of AccColor with the given color. + """ + acc_color: tuple[str, int] = next((c for c in cls.values() if c[0] == color_str.title()), None) + if acc_color and isinstance(acc_color, tuple): + return cls.of_value(acc_color) + raise ValueError(f"'{color_str}'is not a valid AccColor") + + @property + def color(self) -> AccuracyColors: + return self.value[0] + + @property + def val(self) -> int: + """Gets the integer value of the verbosity level. + :return: The integer representation of the verbosity level. + """ + return int(self.value[1]) + + @property + def is_bad(self) -> bool: + return self in [self.BAD, self.INCOMPLETE] + + @property + def is_moderate(self) -> bool: + return self == self.MODERATE + + @property + def is_good(self) -> bool: + return self in [self.GOOD, self.EXCELLENT] + + @property + def is_interrupt(self) -> bool: + return self == self.INTERRUPT + + def passed(self, threshold: "AccColor") -> bool: + """Determine whether the response matches a 'PASS' classification. + :param threshold: The threshold or criteria used to determine a 'PASS' classification. + :return: True if the response meets or exceeds the 'PASS' threshold, otherwise False. + """ + if isinstance(threshold, AccColor): + return self.val <= threshold.val + return False diff --git a/src/main/askai/core/enums/acc_response.py b/src/main/askai/core/enums/acc_response.py index 1b9da14d..b8bef5d2 100644 --- a/src/main/askai/core/enums/acc_response.py +++ b/src/main/askai/core/enums/acc_response.py @@ -12,107 +12,62 @@ Copyright (c) 2024, HomeSetup """ +import os +from dataclasses import dataclass -import re -from typing import Literal +from askai.core.enums.acc_color import AccColor, AccuracyColors +from askai.core.support.utilities import parse_field +from hspylib.core.tools.text_tools import ensure_endswith -from hspylib.core.enums.enumeration import Enumeration - -class AccResponse(Enumeration): +@dataclass(frozen=True) +class AccResponse: """Track and classify accuracy responses based on color classifications. This class provides an enumeration of possible accuracy responses, which are typically represented by different colors. """ - # fmt: off - - EXCELLENT = 'Blue' - - GOOD = 'Green' - - MODERATE = 'Yellow' - - INCOMPLETE = 'Orange' - - BAD = 'Red' - - INTERRUPT = 'Black' - - # fmt: on + acc_color: AccColor + accuracy: float + reasoning: str + tips: str @classmethod - def matches(cls, output: str) -> re.Match: - """Find a match in the given output string. - :param output: The string to search for a match. - :return: A match object if a match is found. - :raises: re.error if an error occurs during the matching process. 
- """ - flags: int = re.IGNORECASE | re.MULTILINE | re.DOTALL - return re.search(cls._re(), output.replace("\n", " "), flags=flags) - - @classmethod - def _re(cls) -> str: + def parse_response(cls, response: str) -> "AccResponse": """TODO""" - return rf"^\$?({'|'.join(cls.values())})[:,-]\s*[0-9]+%\s+(.+)" - @classmethod - def strip_code(cls, message: str) -> str: - """Strip the color code from the message. - :param message: The message from which to strip color codes. - :return: The message with color codes removed. - """ - mat = cls.matches(message) - return str(mat.group(2)).strip() if mat else message.strip() + # FIXME: Remove log the response + with open("/Users/hjunior/Desktop/acc-response-resp.txt", "w") as f_bosta: + f_bosta.write(response + os.linesep) + f_bosta.flush() - @classmethod - def of_status(cls, status: str, reasoning: str | None) -> "AccResponse": - """Create an AccResponse instance based on status and optional reasoning. - :param status: The status as a string. - :param reasoning: Optional reasoning for the status, formatted as '%
'. - :return: An instance of AccResponse with the given status and reasoning. - """ - resp = cls.of_value(status.title()) - if reasoning and (mat := re.match(r"(^[0-9]{1,3})%\s+(.*)", reasoning)): - resp.rate = float(mat.group(1)) - resp.reasoning = mat.group(2) - return resp - - def __init__(self, color: Literal["Blue", "Green", "Yellow", "Orange", "Red"]): - self.color = color - self.reasoning: str | None = None - self.rate: float | None = None + # Parse fields + acc_color: AccColor = AccColor.of_color(parse_field("@color", response)) + accuracy: float = float(parse_field("@accuracy", response).strip("%")) + reasoning: str = parse_field("@reasoning", response) + tips: str = parse_field("@tips", response) + + return AccResponse(acc_color, accuracy, reasoning, tips) def __str__(self): - details: str = f"{' -> ' + str(self.rate) + '% ' + self.reasoning if self.reasoning else ''}" - return f"{self.name}{details}" + return f"{self.status} -> {self.details}" @property - def is_bad(self) -> bool: - return self in [self.BAD, self.INCOMPLETE] + def color(self) -> AccuracyColors: + return self.acc_color.color @property - def is_moderate(self) -> bool: - return self == self.MODERATE + def status(self) -> str: + return f"{self.color}, {str(self.accuracy)}%" @property - def is_good(self) -> bool: - return self in [self.GOOD, self.EXCELLENT] + def details(self) -> str: + return f"{ensure_endswith(self.reasoning, '.')} {'**' + self.tips + '**' if self.tips else ''}" @property def is_interrupt(self) -> bool: - return self == self.INTERRUPT - - def passed(self, threshold: "AccResponse") -> bool: - """Determine whether the response matches a 'PASS' classification. - :param threshold: The threshold or criteria used to determine a 'PASS' classification. - :return: True if the response meets or exceeds the 'PASS' threshold, otherwise False. 
- """ - if isinstance(threshold, AccResponse): - idx_self, idx_threshold = None, None - for i, v in enumerate(AccResponse.values()): - if v == self.value: - idx_self = i - if v == threshold.value: - idx_threshold = i - return idx_self is not None and idx_threshold is not None and idx_self <= idx_threshold - return False + """TODO""" + return self.acc_color.is_interrupt + + def is_pass(self, threshold: AccColor) -> bool: + """TODO""" + return self.acc_color.passed(threshold) diff --git a/src/main/askai/core/features/processors/task_splitter.py b/src/main/askai/core/features/processors/task_splitter.py index 0a0e3fe4..ccb98eba 100644 --- a/src/main/askai/core/features/processors/task_splitter.py +++ b/src/main/askai/core/features/processors/task_splitter.py @@ -25,10 +25,11 @@ from askai.core.askai_prompt import prompt from askai.core.component.geo_location import geo_location from askai.core.engine.openai.temperature import Temperature +from askai.core.enums.acc_color import AccColor from askai.core.enums.acc_response import AccResponse from askai.core.enums.routing_model import RoutingModel from askai.core.features.router.agent_tools import features -from askai.core.features.router.task_accuracy import assert_accuracy +from askai.core.features.router.evaluation import assert_accuracy from askai.core.features.router.task_agent import agent from askai.core.features.tools.general import final_answer from askai.core.model.action_plan import ActionPlan @@ -165,8 +166,7 @@ def _splitter_wrapper_() -> Optional[str]: if response := runnable.invoke({"input": question}, config={"configurable": {"session_id": "HISTORY"}}): log.info("Router::[RESPONSE] Received from AI: \n%s.", str(response.content)) plan = ActionPlan.create(question, response, model) - task_list = plan.tasks - if task_list: + if task_list := plan.tasks: events.reply.emit(reply=AIReply.debug(msg.action_plan(str(plan)))) if plan.speak: events.reply.emit(reply=AIReply.info(plan.speak)) @@ -183,7 +183,7 @@ def _splitter_wrapper_() -> Optional[str]: try: wrapper_output = self._process_tasks(task_list) - assert_accuracy(question, wrapper_output, AccResponse.MODERATE) + assert_accuracy(question, wrapper_output, AccColor.MODERATE) except (InterruptionRequest, TerminatingQuery) as err: return str(err) except self.RETRIABLE_ERRORS: diff --git a/src/main/askai/core/features/router/__init__.py b/src/main/askai/core/features/router/__init__.py index 7b49aa9f..a9b6dc5e 100644 --- a/src/main/askai/core/features/router/__init__.py +++ b/src/main/askai/core/features/router/__init__.py @@ -5,5 +5,5 @@ # Package: main.askai.core.features.router """Package initialization.""" -__all__ = ["model_selector", "task_accuracy", "task_agent", "agent_tools.py"] +__all__ = ["model_selector", "evaluation.py", "task_agent", "agent_tools.py"] __version__ = "1.0.13" diff --git a/src/main/askai/core/features/router/task_accuracy.py b/src/main/askai/core/features/router/evaluation.py similarity index 81% rename from src/main/askai/core/features/router/task_accuracy.py rename to src/main/askai/core/features/router/evaluation.py index a8cff968..79e2d7e6 100644 --- a/src/main/askai/core/features/router/task_accuracy.py +++ b/src/main/askai/core/features/router/evaluation.py @@ -17,6 +17,7 @@ from askai.core.askai_messages import msg from askai.core.askai_prompt import prompt from askai.core.engine.openai.temperature import Temperature +from askai.core.enums.acc_color import AccColor from askai.core.enums.acc_response import AccResponse from askai.core.model.ai_reply 
import AIReply from askai.core.support.langchain_support import lc_llm @@ -30,21 +31,21 @@ import logging as log -EVALUATION_GUIDE: str = dedent( - """ +# fmt: off +EVALUATION_GUIDE: str = dedent(""" **Accuracy Evaluation Guidelines:** 1. Analyze past responses to ensure accuracy. 2. Regularly self-critique overall responses. 3. Reflect on past strategies to refine your approach. 4. Experiment with different methods or solutions. -""" -).strip() +""").strip() +# fmt: on RAG: RAGProvider = RAGProvider("accuracy.csv") -def assert_accuracy(question: str, ai_response: str, pass_threshold: AccResponse = AccResponse.MODERATE) -> AccResponse: +def assert_accuracy(question: str, ai_response: str, pass_threshold: AccColor = AccColor.MODERATE) -> AccResponse: """Assert that the AI's response to the question meets the required accuracy threshold. :param question: The user's question. :param ai_response: The AI's response to be analyzed for accuracy. @@ -53,32 +54,31 @@ def assert_accuracy(question: str, ai_response: str, pass_threshold: AccResponse :return: The accuracy classification of the AI's response as an AccResponse enum value. """ if ai_response and ai_response not in msg.accurate_responses: - issues_prompt = PromptTemplate(input_variables=["problems"], template=prompt.read_prompt("evaluation")) - assert_template = PromptTemplate( - input_variables=["rag", "input", "response"], template=prompt.read_prompt("accuracy") + acc_template = PromptTemplate(input_variables=["problems"], template=prompt.read_prompt("acc-report")) + eval_template = PromptTemplate( + input_variables=["rag", "input", "response"], template=prompt.read_prompt("evaluation") ) - final_prompt = assert_template.format(rag=RAG.get_rag_examples(question), input=question, response=ai_response) + final_prompt = eval_template.format(rag=RAG.get_rag_examples(question), input=question, response=ai_response) log.info("Assert::[QUESTION] '%s' context: '%s'", question, ai_response) llm = lc_llm.create_chat_model(Temperature.COLDEST.temp) response: AIMessage = llm.invoke(final_prompt) if response and (output := response.content): - if mat := AccResponse.matches(output): - status, details = mat.group(1), mat.group(2) - log.info("Accuracy check -> status: '%s' reason: '%s'", status, details) - events.reply.emit(reply=AIReply.debug(msg.assert_acc(status, details))) - if (rag_resp := AccResponse.of_status(status, details)).is_interrupt: + if acc := AccResponse.parse_response(output): + log.info("Accuracy check -> status: '%s' details: '%s'", acc.status, acc.details) + events.reply.emit(reply=AIReply.debug(msg.assert_acc(acc.status, acc.details))) + if acc.is_interrupt: # AI flags that it can't continue interacting. log.warning(msg.interruption_requested(output)) raise InterruptionRequest(ai_response) - elif not rag_resp.passed(pass_threshold): + elif not acc.is_pass(pass_threshold): # Include the guidelines for the first mistake. if not shared.context.get("EVALUATION"): shared.context.push("EVALUATION", EVALUATION_GUIDE) - shared.context.push("EVALUATION", issues_prompt.format(problems=AccResponse.strip_code(output))) + shared.context.push("EVALUATION", acc_template.format(problems=acc.details)) raise InaccurateResponse(f"AI Assistant failed to respond => '{response.content}'") - return rag_resp - # At this point, the response was not Good. + return acc + # At this point, the response was inaccurate. raise InaccurateResponse(f"AI Assistant didn't respond accurately. 
Response: '{response}'") diff --git a/src/main/askai/core/features/router/task_agent.py b/src/main/askai/core/features/router/task_agent.py index 0441297e..681b1317 100644 --- a/src/main/askai/core/features/router/task_agent.py +++ b/src/main/askai/core/features/router/task_agent.py @@ -6,9 +6,9 @@ from askai.core.askai_messages import msg from askai.core.askai_prompt import prompt from askai.core.engine.openai.temperature import Temperature -from askai.core.enums.acc_response import AccResponse +from askai.core.enums.acc_color import AccColor from askai.core.features.router.agent_tools import features -from askai.core.features.router.task_accuracy import assert_accuracy +from askai.core.features.router.evaluation import assert_accuracy from askai.core.model.ai_reply import AIReply from askai.core.support.langchain_support import lc_llm from askai.core.support.shared_instances import shared @@ -60,7 +60,7 @@ def invoke(self, task: str) -> str: shared.context.push("HISTORY", task, "assistant") shared.context.push("HISTORY", output, "assistant") shared.memory.save_context({"input": task}, {"output": output}) - assert_accuracy(task, output, AccResponse.MODERATE) + assert_accuracy(task, output, AccColor.MODERATE) else: output = msg.no_output("AI") diff --git a/src/main/askai/core/features/tools/terminal.py b/src/main/askai/core/features/tools/terminal.py index 328a197d..23582407 100644 --- a/src/main/askai/core/features/tools/terminal.py +++ b/src/main/askai/core/features/tools/terminal.py @@ -14,7 +14,7 @@ """ from askai.core.askai_events import events from askai.core.askai_messages import msg -from askai.core.features.router.task_accuracy import resolve_x_refs +from askai.core.features.router.evaluation import resolve_x_refs from askai.core.model.ai_reply import AIReply from askai.core.support.shared_instances import shared from askai.core.support.utilities import extract_path, media_type_of diff --git a/src/main/askai/core/features/tools/vision.py b/src/main/askai/core/features/tools/vision.py index 51c4bdcd..78fe35e7 100644 --- a/src/main/askai/core/features/tools/vision.py +++ b/src/main/askai/core/features/tools/vision.py @@ -2,7 +2,7 @@ from askai.core.askai_messages import msg from askai.core.component.cache_service import PICTURE_DIR from askai.core.engine.ai_vision import AIVision -from askai.core.features.router.task_accuracy import resolve_x_refs +from askai.core.features.router.evaluation import resolve_x_refs from askai.core.model.ai_reply import AIReply from askai.core.model.image_result import ImageResult from askai.core.support.shared_instances import shared diff --git a/src/main/askai/core/model/action_plan.py b/src/main/askai/core/model/action_plan.py index 1c33a9e3..b5b81dcc 100644 --- a/src/main/askai/core/model/action_plan.py +++ b/src/main/askai/core/model/action_plan.py @@ -12,12 +12,13 @@ Copyright (c) 2024, HomeSetup """ -import ast +import os import re from dataclasses import dataclass, field from types import SimpleNamespace from askai.core.model.model_result import ModelResult +from askai.core.support.utilities import parse_field, parse_list, parse_word from hspylib.core.preconditions import check_state from langchain_core.messages import AIMessage @@ -31,7 +32,7 @@ class ActionPlan: question: str = None speak: str = None primary_goal: str = None - sub_goals: list[str] = field(default_factory=list) + sub_goals: list[SimpleNamespace] = field(default_factory=list) tasks: list[SimpleNamespace] = field(default_factory=list) model: ModelResult = 
field(default_factory=ModelResult.default) @@ -58,34 +59,19 @@ def _parse_response(question: str, response: str) -> "ActionPlan": :param response: The router's response. :return: An instance of ActionPlan created from the parsed response. """ - flags: int = re.IGNORECASE | re.MULTILINE | re.DOTALL + # FIXME: Remove log the response + with open("/Users/hjunior/Desktop/task-splitter-resp.txt", "w") as f_bosta: + f_bosta.write(response + os.linesep) + f_bosta.flush() - # Define patterns for the required fields - speak_pattern = r"@speak:\s*\"(.+?)\"" - primary_goal_pattern = r"@primary_goal:\s*(.+)" - sub_goals_pattern = r"@sub_goals:\s*\[(.+?)\]" - tasks_pattern = r"@tasks:\s*\[(.+?)\]" - direct_pattern = r"\**Direct:\**\s*(.+?)" - - # Extract using regex - speak_match = re.search(speak_pattern, response, flags) - primary_goal_match = re.search(primary_goal_pattern, response, flags) - sub_goals_match = re.search(sub_goals_pattern, response, flags) - tasks_match = re.search(tasks_pattern, response, flags) - direct_match = re.search(direct_pattern, response, flags) - - # Parse fields - speak = speak_match.group(1) if speak_match else None - primary_goal = primary_goal_match.group(1) if primary_goal_match else None - sub_goals = ast.literal_eval(f"[{sub_goals_match.group(1)}]") if sub_goals_match else [] - tasks = ast.literal_eval(f"[{tasks_match.group(1)}]") if tasks_match else [] - tasks = list(map(lambda t: SimpleNamespace(**t), tasks)) - direct = direct_match.group(1) if direct_match else None + speak: str = parse_field("@speak", response) + primary_goal: str = parse_field("@primary_goal", response) + sub_goals: list[SimpleNamespace] = parse_list("@sub_goals", response) + tasks: list[SimpleNamespace] = parse_list("@tasks", response) + direct: str = parse_word("direct", response) # fmt: off - if direct: - plan = ActionPlan._direct_answer(question, response, ModelResult.default()) - elif speak and primary_goal and tasks: + if primary_goal and tasks: plan = ActionPlan( question=question, speak=speak, @@ -93,6 +79,8 @@ def _parse_response(question: str, response: str) -> "ActionPlan": sub_goals=sub_goals, tasks=tasks ) + elif direct and len(direct) > 1: + plan = ActionPlan._direct_answer(question, response, ModelResult.default()) else: plan = ActionPlan._direct_task(question, response, ModelResult.default()) # fmt: on @@ -137,3 +125,13 @@ def __str__(self): def __len__(self): return len(self.tasks) + + def __eq__(self, other: "ActionPlan") -> bool: + """TODO""" + return ( + self.question == other.question + and self.speak == other.speak + and self.primary_goal == other.primary_goal + and self.sub_goals == other.sub_goals + and self.tasks == other.tasks + ) diff --git a/src/main/askai/core/support/utilities.py b/src/main/askai/core/support/utilities.py index 70168244..4e4c1bdd 100644 --- a/src/main/askai/core/support/utilities.py +++ b/src/main/askai/core/support/utilities.py @@ -13,6 +13,7 @@ Copyright (c) 2024, HomeSetup """ import base64 +import json import mimetypes import os import re @@ -21,6 +22,7 @@ import sys from os.path import basename, dirname from pathlib import Path +from types import SimpleNamespace from typing import AnyStr, Optional from askai.core.support.text_formatter import text_formatter @@ -198,3 +200,50 @@ def seconds(millis: int) -> float: :return: The equivalent time in seconds as a float. 
""" return millis / 1000 + + +def ensure_parseable(content: str) -> str: + """TODO""" + if content.startswith('['): + # Content is already a JSON array + return content + # Process lines to extract JSON objects + lines = content.split('\n') + json_objects: list[str] = list() + for line in lines: + if not (line := line.strip()): + continue # Skip empty lines + if line.startswith('-'): + line = line.lstrip('-').strip() + json_objects.append(line) + # Wrap in square brackets to form a JSON array + json_array_str = f"[{','.join(json_objects)}]" + return json_array_str + + +def parse_field(field_name: str, text: str) -> Optional[str]: + """TODO""" + flags: int = re.IGNORECASE | re.DOTALL + field_pattern: str = field_name + r':\s*\"?(.+?)["@]' + field_matcher: re.Match[str] | None = re.search(field_pattern, text, flags) + field_value: str = field_matcher.group(1) if field_matcher else None + return field_value.strip() if field_value else None + + +def parse_word(word: str, text: str) -> Optional[str]: + """TODO""" + flags: int = re.IGNORECASE | re.DOTALL + word_pattern: str = r'[*_\s]*' + word + r':[*_\s]*(.+)["@]' + word_matcher: re.Match[str] | None = re.search(word_pattern, text, flags) + word_value: str = word_matcher.group(1) if word_matcher else None + return word_value.strip() if word_value else None + + +def parse_list(field_name: str, text: str) -> list[SimpleNamespace]: + """TODO""" + flags: int = re.IGNORECASE | re.DOTALL + list_pattern: str = field_name + r':\s*(.*?)(?:\n@|$)' + list_matcher: re.Match[str] | None = re.search(list_pattern, text, flags) + list_value: list[str] = json.loads(ensure_parseable(list_matcher.group(1))) if list_matcher else [] + assert isinstance(list_value, list), f"Parse error: Could not parse a list: {list_matcher.group(1)}" + return list(map(lambda t: SimpleNamespace(**t), list_value)) if list_value else [] diff --git a/src/main/askai/resources/prompts/acc-report.txt b/src/main/askai/resources/prompts/acc-report.txt new file mode 100644 index 00000000..8cd9e2fc --- /dev/null +++ b/src/main/askai/resources/prompts/acc-report.txt @@ -0,0 +1,8 @@ +The (AI-Assistant) provided a bad answer. +Improve subsequent responses by addressing the following: + +--- +{problems} +--- + +If you don't have an answer, simply say: "I don't know". Don't try do make up an answer. diff --git a/src/main/askai/resources/prompts/accuracy.txt b/src/main/askai/resources/prompts/accuracy.txt deleted file mode 100644 index 47da751c..00000000 --- a/src/main/askai/resources/prompts/accuracy.txt +++ /dev/null @@ -1,95 +0,0 @@ -You are a QA (quality assurance) Specialist. - -Given the human input and the AI response, your task is to evaluate "AI-generated" responses quality. - -This involves examining the user question, user goals and their underlying sub goals, and the "AI-response". Disregard the "Response Formatting" when classifying. - -Check if the AI response explicitly and directly addresses the user's request by providing all necessary information and details. The response must include specific actions, steps, tasks, and objectives as per the user's question. - -Evaluate using percentage as your color threshold reference (from 0% to 100%). - -Avoid using markdown or any special formatting in your response. - -Use the following criteria for classification: - -1. **Blue**: Level of accuracy [100%-95%]. Reasoning: The AI response is perfect responding to the question posed, including a detailed and accurate information. - -1. **Green**: Level of accuracy [94%-70%]. 
Reasoning: The AI response successfully addresses the question posed, indicating a full understanding and appropriate analysis. - -2. **Yellow**: Level of accuracy [69%-50%]. Reasoning: The AI response partially addresses the question but lacks full depth or detail, suggesting moderate understanding. - -3. **Orange**: Level of accuracy [49%-30%]. Reasoning: The AI response is incomplete or if you have low confidence of the classification. - -4. **Red**: Level of accuracy [29%-0%]. Reasoning: The AI response fails to adequately address the question, indicating a misunderstanding or incorrect analysis. - -5. **Black**: Level of accuracy [Any]. Reasoning: The AI responds that it cannot continue with further interactions due to a specified reason. - - -**When evaluating responses, classify 'Black' (Interrupt Responses) when:** - -- The response explains the lack of information, context, or when the AI is clearly having trouble understanding the user input. - -- The response is a negative answer to the question. - - -**When evaluating responses, classify 'Green' or 'Blue' (Known Good/Excellent Responses) when:** - -- 'Blue' if it detects a successful command execution, e.g., "OK, command succeeded". The "command output" can be disregarded for this classification. - -- The response responds to conversations where there are no definitive right or wrong answers. - -- Regardless of the question, if the response includes the phrase: "Summarization of docs at: '' succeeded !". - -- Regardless of the question, if the response includes the phrase: "Your search returned the following:". - - -**When evaluating responses, classify 'Red' (Known Bad Responses) when:** - -- The response fails to resolve the primary goal; 'Orange' if the response fails to resolve any of the sub goals. - -- The response language used by the AI differs from the user question (language mismatch). - -- The following message is detected: "Invalid or incomplete response". - -- Acknowledging or mentioning previous responses, indicating or stating the intention of accomplishment, are considered unhelpful. - -- Check if the response is coherent with the question. Ensure the answer is unbiased and does not rely on stereotypes. Detect AI hallucinations by verifying the accuracy of the response. Classify the response -as 'Red' if it does not align with known facts. - -- The AI is seeking user assistance. - - -**Classification Guidelines (rate from 0% to 100%):** - -- Assess the AI's response for correctness by considering its ability to effectively address and correct syntax errors or misinterpretations in the user's input, rather than focusing solely on literal repetitions or minor discrepancies in terminology. - -- Revise the classifications for responses from the AI that contain irrelevant information to 'Yellow' instead of 'Red', as any additional information is still valued. - -- "I don't know." may be a good response. Before classifying, check the chat context or provided contexts to make sure the AI understood the question, but, it does not have an answer. If that's the case, classify as 'Green'. - -- Do not include any part of the question in your response. - -- Indicate your classification choice ('Red', 'Orange', 'Yellow', 'Green', or 'Blue') followed by the reasoning behind your decision. - -- When reviewing cross-references, vigilance is crucial to prevent misclassifications due to ambiguous entries. Consult if the ambiguity was resolved. 
Exercise caution to avoid categorizing entries as 'Red' or 'Orange' unless absolutely certain. - -- When the primary goal is achieved but lacks further details, classify it as 'Yellow' or 'Green', depending on the amount of missing details. - -- Before returning a classification, check the chat history and all provided context, as that may lead to a different classification, and to double check the classification is accurate. - ---- -{rag} ---- - -When have an answer, format like: - -: - - -Human Input: "{input}" - - -AI Response: "{response}" - - -Begin the classification! diff --git a/src/main/askai/resources/prompts/evaluation.txt b/src/main/askai/resources/prompts/evaluation.txt index 8cd9e2fc..99782b0a 100644 --- a/src/main/askai/resources/prompts/evaluation.txt +++ b/src/main/askai/resources/prompts/evaluation.txt @@ -1,8 +1,110 @@ -The (AI-Assistant) provided a bad answer. -Improve subsequent responses by addressing the following: +You are a QA (quality assurance) Specialist. + +Given the human input and the AI response, your task is to evaluate "AI-generated" responses quality. + +This involves examining the user question, user goals and their underlying sub goals, and the "AI-response". Disregard the "Response Formatting" when classifying. + +Check if the AI response explicitly and directly addresses the user's request by providing all necessary information and details. The response must include specific actions, steps, tasks, and objectives as per the user's question. + +Evaluate using percentage as your color threshold reference (from 0% to 100%). + +Avoid using markdown or any special formatting in your response. + +Use the following criteria for classification: + +1. **Blue**: Level of accuracy [100%-95%]. Reasoning: The AI response is perfect responding to the question posed, including a detailed and accurate information. + +1. **Green**: Level of accuracy [94%-70%]. Reasoning: The AI response successfully addresses the question posed, indicating a full understanding and appropriate analysis. + +2. **Yellow**: Level of accuracy [69%-50%]. Reasoning: The AI response partially addresses the question but lacks full depth or detail, suggesting moderate understanding. + +3. **Orange**: Level of accuracy [49%-30%]. Reasoning: The AI response is incomplete or if you have low confidence of the classification. + +4. **Red**: Level of accuracy [29%-0%]. Reasoning: The AI response fails to adequately address the question, indicating a misunderstanding or incorrect analysis. + +5. **Black**: Level of accuracy [Any]. Reasoning: The AI responds that it cannot continue with further interactions due to a specified reason. + + +**When evaluating responses, classify 'Black' (Interrupt Responses) when:** + +- The response explains the lack of information, context, or when the AI is clearly having trouble understanding the user input. + +- The response is a negative answer to the question. + + +**When evaluating responses, classify 'Green' or 'Blue' (Known Good/Excellent Responses) when:** + +- 'Blue' if it detects a successful command execution, e.g., "OK, command succeeded". The "command output" can be disregarded for this classification. + +- The response responds to conversations where there are no definitive right or wrong answers. + +- Regardless of the question, if the response includes the phrase: "Summarization of docs at: '' succeeded !". + +- Regardless of the question, if the response includes the phrase: "Your search returned the following:". 
+ + +**When evaluating responses, classify 'Red' (Known Bad Responses) when:** + +- The response fails to resolve the primary goal; 'Orange' if the response fails to resolve any of the sub goals. + +- The response language used by the AI differs from the user question (language mismatch). + +- The following message is detected: "Invalid or incomplete response". + +- Acknowledging or mentioning previous responses, indicating or stating the intention of accomplishment, are considered unhelpful. + +- Check if the response is coherent with the question. Ensure the answer is unbiased and does not rely on stereotypes. Detect AI hallucinations by verifying the accuracy of the response. Classify the response +as 'Red' if it does not align with known facts. + +- The AI is seeking user assistance. + + +**Classification Guidelines (rate from 0% to 100%):** + +- Assess the AI's response for correctness by considering its ability to effectively address and correct syntax errors or misinterpretations in the user's input, rather than focusing solely on literal repetitions or minor discrepancies in terminology. + +- Revise the classifications for responses from the AI that contain irrelevant information to 'Yellow' instead of 'Red', as any additional information is still valued. + +- "I don't know." may be a good response. Before classifying, check the chat context or provided contexts to make sure the AI understood the question, but, it does not have an answer. If that's the case, classify as 'Green'. + +- Do not include any part of the question in your response. + +- Indicate your classification choice ('Red', 'Orange', 'Yellow', 'Green', or 'Blue') followed by the reasoning behind your decision. + +- When reviewing cross-references, vigilance is crucial to prevent misclassifications due to ambiguous entries. Consult if the ambiguity was resolved. Exercise caution to avoid categorizing entries as 'Red' or 'Orange' unless absolutely certain. + +- When the primary goal is achieved but lacks further details, classify it as 'Yellow' or 'Green', depending on the amount of missing details. + +- Before returning a classification, check the chat history and all provided context, as that may lead to a different classification, and to double check the classification is accurate. --- -{problems} +{rag} --- -If you don't have an answer, simply say: "I don't know". Don't try do make up an answer. +The response should follow this format: + +@ai_response: "" +@thought: "" +@observation: "" +@criticism: "" +... (repeat thought/observation/criticism N times) +@thought: "Everything is looking good." +@conclusion: "I know what to respond." +@color: "" +@accuracy: "%" +@reasoning: "" +@tips: "" + + +<> + +**THE RESPONSE FORMAT IS CRUCIAL, ALTERING IT WILL CAUSE THE PARSER TO FAIL.** + + +Human Input: "{input}" + + +AI Response: "{response}" + + +Begin the classification! diff --git a/src/main/askai/resources/prompts/task-splitter.txt b/src/main/askai/resources/prompts/task-splitter.txt index 004206a3..c6a586a9 100644 --- a/src/main/askai/resources/prompts/task-splitter.txt +++ b/src/main/askai/resources/prompts/task-splitter.txt @@ -112,16 +112,14 @@ The response should follow this format: @thought: "" @observation: "" @criticism: "" -@check_history: "" -... (repeat thought/observation/criticism/check_history N times) -@thought: "Everything is looking good." +... (repeat thought/observation/criticism N times) @conclusion: "I know what to respond." @primary_goal: "" @sub_goals: [ {{{{ "id": "", "sub_goal": "" }}}}, ... 
(repeat N times) ] -@speak: "", +@speak: "" @tasks: [ {{{{ "id": "", "task": "", "path": "absolute file or folder path; 'N/A' if uncertain or not needed" }}}}, ... (repeat N times) @@ -130,7 +128,7 @@ The response should follow this format: <> -Always remember to include at least @primary_goal, @sub_goals, @speak, and @tasks. Ensure that @sub_goals and @tasks are parseable by ast.literal_eval, so make sure to include the '[..]' brackets around them. Retain all '@' and '"' symbols and avoid reformatting '@words', as altering them will cause the parser to fail. +**THE RESPONSE FORMAT IS CRUCIAL, ALTERING IT WILL CAUSE THE PARSER TO FAIL.** -Begin! +Begin splittings the tasks! diff --git a/src/test/core/model/__init__.py b/src/test/core/model/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/test/core/model/test_action_plan.py b/src/test/core/model/test_action_plan.py new file mode 100644 index 00000000..ff7f4646 --- /dev/null +++ b/src/test/core/model/test_action_plan.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" + @project: HsPyLib-AskAI + @package: askai.test.core.support + @file: test_utilities.py + @created: Fri, 22 Mar 2024 + @author: "Hugo Saporetti Junior + @site: "https://github.com/yorevs/hspylib") + @license: MIT - Please refer to + + Copyright (c) 2024, HomeSetup +""" +import sys +import unittest +from pathlib import Path + +from askai.__classpath__ import classpath +from askai.core.model.action_plan import ActionPlan +from askai.core.model.model_result import ModelResult +from langchain_core.messages import AIMessage + +from fixtures.action_plan import stub_response + + +class TestClass(unittest.TestCase): + + RESPONSE_FILE: Path = Path(classpath.root_dir / "test/resources/llm-responses/task-splitter.txt") + + RESPONSE_FILE_TEXT: str = RESPONSE_FILE.read_text() + + RESPONSES: list[str] = list(filter(None, map(str.strip, RESPONSE_FILE_TEXT.split("---")))) + + # Setup tests + def setUp(self): + pass + + # Teardown tests + def tearDown(self): + pass + + # TEST CASES ---------- + + def test_should_extract_and_parse_llm_responses(self): + # fmt: off + # Question, AI response, expected ActionPlan object. + test_cases = [ + ('list my downloads', self.RESPONSES[0], stub_response(0)), + ('list my downloads', self.RESPONSES[1], stub_response(0)), + ('hello', self.RESPONSES[2], stub_response(1)), + ('List my downloads and let me know if there is any image.', self.RESPONSES[3], stub_response(2)), + ('List my downloads and let me know if there is any image.', self.RESPONSES[4], stub_response(2)), + ] + # fmt: on + + for question, response, expected in test_cases: + with self.subTest(response=response): + result = ActionPlan.create(question, AIMessage(response), ModelResult.default()) + self.assertEqual(result, expected) + + +# Program entry point. 
+if __name__ == "__main__": + suite = unittest.TestLoader().loadTestsFromTestCase(TestClass) + unittest.TextTestRunner(verbosity=2, failfast=True, stream=sys.stdout).run(suite) diff --git a/src/test/fixtures/__init__.py b/src/test/fixtures/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/test/fixtures/action_plan.py b/src/test/fixtures/action_plan.py new file mode 100644 index 00000000..b40c96be --- /dev/null +++ b/src/test/fixtures/action_plan.py @@ -0,0 +1,64 @@ +from types import SimpleNamespace +from typing import Optional + +from askai.core.model.action_plan import ActionPlan +from askai.core.model.model_result import ModelResult +from hspylib.core.tools.dict_tools import get_or_default + + +def stub_response(index: int) -> Optional[ActionPlan]: + # Fields: question, speak, primary_goal, sub_goals, tasks, model + responses: list = [ + ActionPlan( + "list my downloads", + "I will help you list the contents of your 'Downloads' folder.", + "List the contents of the 'Downloads' folder", + [], + [ + SimpleNamespace( + **{ + "id": "1", + "task": "List the contents of the 'Downloads' folder", + "path": "/Users/hjunior/Downloads", + } + ) + ], + ModelResult.default(), + ), + ActionPlan( + "hello", + "I will greet the user and initiate the conversation.", + "Respond to the user's greeting.", + [], + [SimpleNamespace(**{"id": "1", "task": "Direct: 'Hello! How can I assist you today?'"})], + ModelResult.default(), + ), + ActionPlan( + "List my downloads and let me know if there is any image.", + "I will list the contents of your downloads folder and check for any image files present.", + "List the contents of the downloads folder and identify any image files", + [ + SimpleNamespace(**{"id": "1", "sub_goal": "List the contents of the downloads folder"}), + SimpleNamespace(**{"id": "2", "sub_goal": "Identify image files in the downloads folder"}), + ], + [ + SimpleNamespace( + **{ + "id": "1", + "task": "List the contents of the downloads folder", + "path": "/Users/hjunior/Downloads", + } + ), + SimpleNamespace( + **{ + "id": "2", + "task": "Identify image files in the downloads folder", + "path": "/Users/hjunior/Downloads", + } + ), + ], + ModelResult.default(), + ), + ] + + return get_or_default(responses, index, None) diff --git a/src/test/resources/llm-responses/task-splitter.txt b/src/test/resources/llm-responses/task-splitter.txt new file mode 100644 index 00000000..b5741a5a --- /dev/null +++ b/src/test/resources/llm-responses/task-splitter.txt @@ -0,0 +1,57 @@ +@thought: The user wants to list the contents of the "Downloads" folder. +@observation: The user is requesting to view the files present in the "Downloads" directory. +@conclusion: I need to assist the user in listing the contents of the "Downloads" folder. + +@primary_goal: "List the contents of the 'Downloads' folder" +@speak: "I will help you list the contents of your 'Downloads' folder." +@tasks: [ + { "id": "1", "task": "List the contents of the 'Downloads' folder", "path": "/Users/hjunior/Downloads" } +] +--- +@thought: The user wants to list the contents of the "Downloads" folder. +@observation: The user is requesting to view the files present in the "Downloads" directory. +@conclusion: I need to assist the user in listing the contents of the "Downloads" folder. + +@primary_goal: "List the contents of the 'Downloads' folder" +@speak: "I will help you list the contents of your 'Downloads' folder." 
+@tasks: + { "id": "1", "task": "List the contents of the 'Downloads' folder", "path": "/Users/hjunior/Downloads" } +--- +@thought: The user is initiating the conversation with a simple greeting. +@observation: The user's query is straightforward and requires a friendly response. +@conclusion: I should respond with a welcoming message to engage the user in the conversation. +@primary_goal: Respond to the user's greeting. +@speak: I will greet the user and initiate the conversation. +@tasks: + { "id": "1", "task": "Direct: 'Hello! How can I assist you today?'" } +--- +@thought: The user wants me to list the contents of the downloads folder and check for any image files present. +@observation: The user is interested in knowing the contents of the downloads folder and specifically if there are any image files. +@conclusion: I need to list the downloads folder and identify any image files within it. + +@primary_goal: "List the contents of the downloads folder and identify any image files" +@sub_goals: [ + { "id": "1", "sub_goal": "List the contents of the downloads folder" }, + { "id": "2", "sub_goal": "Identify image files in the downloads folder" } +] +@speak: "I will list the contents of your downloads folder and check for any image files present." + +@tasks: [ + { "id": "1", "task": "List the contents of the downloads folder", "path": "/Users/hjunior/Downloads" }, + { "id": "2", "task": "Identify image files in the downloads folder", "path": "/Users/hjunior/Downloads" } +] +--- +@thought: The user wants me to list the contents of the downloads folder and check for any image files present. +@observation: The user is interested in knowing the contents of the downloads folder and specifically if there are any image files. +@conclusion: I need to list the downloads folder and identify any image files within it. + +@primary_goal: "List the contents of the downloads folder and identify any image files" +@sub_goals: + - { "id": "1", "sub_goal": "List the contents of the downloads folder" } + - { "id": "2", "sub_goal": "Identify image files in the downloads folder" } + +@speak: "I will list the contents of your downloads folder and check for any image files present." + +@tasks: + { "id": "1", "task": "List the contents of the downloads folder", "path": "/Users/hjunior/Downloads" } + { "id": "2", "task": "Identify image files in the downloads folder", "path": "/Users/hjunior/Downloads" }
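
Usage sketch for the new response parsing (illustrative only, not part of the diff above; assumes the patch is applied): the helpers added in src/main/askai/core/support/utilities.py replace the old regex/ast extraction of the router output, and the sample text below mirrors the first stub response in src/test/resources/llm-responses/task-splitter.txt.

from types import SimpleNamespace

from askai.core.support.utilities import parse_field, parse_list

# Router output in the "@field:" format requested by the task-splitter prompt.
response = """\
@primary_goal: "List the contents of the 'Downloads' folder"
@speak: "I will help you list the contents of your 'Downloads' folder."
@tasks: [
  { "id": "1", "task": "List the contents of the 'Downloads' folder", "path": "/Users/hjunior/Downloads" }
]"""

speak: str = parse_field("@speak", response)                  # captured lazily up to the next '"' or '@'
primary_goal: str = parse_field("@primary_goal", response)
tasks: list[SimpleNamespace] = parse_list("@tasks", response)  # JSON array -> list of SimpleNamespace

assert speak == "I will help you list the contents of your 'Downloads' folder."
assert tasks[0].path == "/Users/hjunior/Downloads"

parse_list() also accepts the dash-prefixed and bare-line task variants (normalized by ensure_parseable), which is what the new cases in src/test/core/model/test_action_plan.py exercise against the stub responses above.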