From c23a2ba696502a89fd5c04f164e4469047482bbc Mon Sep 17 00:00:00 2001 From: Hugo Saporetti Junior Date: Thu, 14 Mar 2024 04:04:53 -0300 Subject: [PATCH] Improving prompts - Summarization and Internet - 3 --- src/main/askai/core/askai.py | 4 ++-- src/main/askai/core/askai_messages.py | 4 ++++ .../askai/core/component/internet_service.py | 22 +++++++++++++------ src/main/askai/core/component/summarizer.py | 19 +++++++++------- src/main/askai/core/model/query_response.py | 1 + .../core/processor/internet_processor.py | 7 ++++-- src/main/askai/core/support/utilities.py | 2 +- .../resources/assets/prompts/proxy-prompt.txt | 8 +++---- 8 files changed, 42 insertions(+), 25 deletions(-) diff --git a/src/main/askai/core/askai.py b/src/main/askai/core/askai.py index 4ec6d51c..9b538885 100644 --- a/src/main/askai/core/askai.py +++ b/src/main/askai/core/askai.py @@ -258,8 +258,8 @@ def _process_response(self, proxy_response: QueryResponse) -> bool: # Query processors if processor or (q_type := proxy_response.query_type): if not processor and not (processor := AIProcessor.get_by_query_type(q_type)): - log.error(f"Unable to find a proper processor for query type: {q_type}") - self.reply_error(str(proxy_response)) + log.error(f"Unable to find a proper processor: {str(proxy_response)}") + self.reply_error(msg.no_processor(q_type)) return False log.info("%s::Processing response for '%s'", processor, proxy_response.question) status, output = processor.process(proxy_response) diff --git a/src/main/askai/core/askai_messages.py b/src/main/askai/core/askai_messages.py index 004ba8a5..7f0da3b6 100644 --- a/src/main/askai/core/askai_messages.py +++ b/src/main/askai/core/askai_messages.py @@ -93,6 +93,10 @@ def invalid_cmd_format(self, output: str) -> str: # Failures + @lru_cache + def no_processor(self, query_type: str) -> str: + return self.translate(f"No suitable processor found for query type '{query_type}' !") + @lru_cache def invalid_response(self, response_text: str) -> str: return 
self.translate(f"Received an invalid query response/type '{response_text}' !") diff --git a/src/main/askai/core/component/internet_service.py b/src/main/askai/core/component/internet_service.py index f8ccf9b0..07d6cc6c 100644 --- a/src/main/askai/core/component/internet_service.py +++ b/src/main/askai/core/component/internet_service.py @@ -16,10 +16,11 @@ from typing import Optional, List from hspylib.core.metaclass.singleton import Singleton -from langchain.chains import load_summarize_chain from langchain.chains.combine_documents import create_stuff_documents_chain +from langchain.chains.retrieval_qa.base import RetrievalQA from langchain_community.document_loaders.async_html import AsyncHtmlLoader from langchain_community.utilities import GoogleSearchAPIWrapper +from langchain_community.vectorstores.chroma import Chroma from langchain_core.documents import Document from langchain_core.prompts import ChatPromptTemplate from langchain_core.tools import Tool @@ -27,6 +28,8 @@ from askai.core.askai_events import AskAiEvents from askai.core.askai_messages import msg +from askai.core.component.cache_service import PERSIST_DIR +from askai.core.component.summarizer import summarizer from askai.core.support.langchain_support import lc_llm, load_document @@ -38,13 +41,18 @@ class InternetService(metaclass=Singleton): ASKAI_INTERNET_DATA_KEY = "askai-internet-data" @staticmethod - def scrap_sites(*sites: str) -> Optional[str]: - """TODO""" + def scrap_sites(query: str, *sites: str) -> Optional[str]: + """Scrape a web page and summarize its contents.""" log.info("Scrapping sites: '%s'", str(sites)) - docs: List[Document] = load_document(AsyncHtmlLoader, *sites) - chain = load_summarize_chain(lc_llm.create_chat_model(), chain_type="stuff") - search_results = chain.invoke(docs) - return search_results['output_text'] + documents: List[Document] = load_document(AsyncHtmlLoader, list(sites)) + if len(documents) > 0: + texts: List[Document] = 
summarizer.text_splitter.split_documents(documents) + v_store = Chroma.from_documents(texts, lc_llm.create_embeddings(), persist_directory=str(PERSIST_DIR)) + retriever = RetrievalQA.from_chain_type( + llm=lc_llm.create_model(), chain_type="stuff", retriever=v_store.as_retriever()) + search_results = retriever.invoke({"query": query}) + return search_results['result'] + return None def __init__(self): self._google = GoogleSearchAPIWrapper() diff --git a/src/main/askai/core/component/summarizer.py b/src/main/askai/core/component/summarizer.py index 39440928..fbcde35b 100644 --- a/src/main/askai/core/component/summarizer.py +++ b/src/main/askai/core/component/summarizer.py @@ -26,7 +26,7 @@ from langchain_community.document_loaders import DirectoryLoader from langchain_community.vectorstores.chroma import Chroma from langchain_core.documents import Document -from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter from askai.core.askai_events import AskAiEvents from askai.core.askai_messages import msg @@ -64,6 +64,10 @@ def glob(self) -> str: def path(self) -> str: return f"{self.folder}{self.glob}" + @property + def text_splitter(self) -> TextSplitter: + return self._text_splitter + @lru_cache def generate(self, folder: str | Path, glob: str = None) -> None: """Generate a summarization of the folder contents. 
@@ -74,11 +78,10 @@ def generate(self, folder: str | Path, glob: str = None) -> None: self._glob: str = glob.strip() AskAiEvents.ASKAI_BUS.events.reply.emit(message=msg.summarizing(self.path)) log.info("Summarizing documents from '%s'", self.path) - embeddings = lc_llm.create_embeddings() documents: List[Document] = DirectoryLoader(self.folder, glob=self.glob).load() if len(documents) > 0: texts: List[Document] = self._text_splitter.split_documents(documents) - v_store = Chroma.from_documents(texts, embeddings, persist_directory=str(PERSIST_DIR)) + v_store = Chroma.from_documents(texts, lc_llm.create_embeddings(), persist_directory=str(PERSIST_DIR)) self._retriever = RetrievalQA.from_chain_type( llm=lc_llm.create_model(), chain_type="stuff", retriever=v_store.as_retriever()) else: @@ -89,17 +92,17 @@ def query(self, *queries: str) -> Optional[List[SummaryResult]]: check_argument(len(queries) > 0) if self._retriever is not None: results: List[SummaryResult] = [] - for query_string in queries: - if result := self.query_one(query_string): + for query in queries: + if result := self.query_one(query): results.append(result) return results return None @lru_cache - def query_one(self, query_string: str) -> Optional[SummaryResult]: + def query_one(self, query: str) -> Optional[SummaryResult]: """Query the AI about a given query based on the summarized content.""" - check_argument(len(query_string) > 0) - if result := self._retriever.invoke({"query": query_string}): + check_argument(len(query) > 0) + if result := self._retriever.invoke({"query": query}): return SummaryResult( self._folder, self._glob, result['query'].strip(), result['result'].strip() ) diff --git a/src/main/askai/core/model/query_response.py b/src/main/askai/core/model/query_response.py index b756398a..46302ce3 100644 --- a/src/main/askai/core/model/query_response.py +++ b/src/main/askai/core/model/query_response.py @@ -25,6 +25,7 @@ class QueryResponse: response: str = "" terminating: bool = False 
intelligible: bool = True + changing_subject: bool = False require_internet: bool = False require_summarization: bool = False commands: List[TerminalCommand] = field(default_factory=list) diff --git a/src/main/askai/core/processor/internet_processor.py b/src/main/askai/core/processor/internet_processor.py index 3ebf49b8..791b1437 100644 --- a/src/main/askai/core/processor/internet_processor.py +++ b/src/main/askai/core/processor/internet_processor.py @@ -13,6 +13,7 @@ Copyright·(c)·2024,·HSPyLib """ import logging as log +from functools import partial from typing import Optional, Tuple from hspylib.core.zoned_datetime import now @@ -37,7 +38,6 @@ def __init__(self): def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]: status = False - output = None template = PromptTemplate(input_variables=['cur_date'], template=self.template()) final_prompt: str = msg.translate(template.format(cur_date=now('%Y-%d-%m'))) shared.context.set("SETUP", final_prompt, "system") @@ -49,7 +49,10 @@ def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]: if (response := shared.engine.ask(context, temperature=0.0, top_p=0.0)) and response.is_success: search: SearchResult = object_mapper.of_json(response.message, SearchResult) query = " + ".join(search.keywords) - if results := internet.search_google(query, *search.sites): + fc_call = partial(internet.scrap_sites, query) \ + if query_response.require_summarization \ + else partial(internet.search_google, query) + if results := fc_call(*search.sites): search.results = results output = self._wrap_output(query_response, search) shared.context.set("INTERNET", output, "assistant") diff --git a/src/main/askai/core/support/utilities.py b/src/main/askai/core/support/utilities.py index 6fbe9ef8..dcc74b3b 100644 --- a/src/main/askai/core/support/utilities.py +++ b/src/main/askai/core/support/utilities.py @@ -57,7 +57,7 @@ def beautify(text: Any) -> str: text = re.sub(r"([Jj]oke( [Tt]ime)?)[-:\s][ 
\n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Joke: ", text) text = re.sub(r"[Ff]un [Ff]acts?[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Fun Fact: ", text) text = re.sub(r"[Aa]dvice[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Advice: ", text) - text = re.sub(r"Errors?[-:\s][ \n\t]*", f"%EL0%{ASKAI_CHAT_ICONS['']}{''} Error: ", text) + text = re.sub(r"Errors?[-:\s][ \n\t]*", f"%EL1%{ASKAI_CHAT_ICONS['']}{''} Error: ", text) text = re.sub(r"^\n+", '', text, re.MULTILINE) text = re.sub(r"\n{2,}", '\n', text, re.MULTILINE) text = re.sub(re_url, r'%CYAN% \1%GREEN%', text) diff --git a/src/main/askai/resources/assets/prompts/proxy-prompt.txt b/src/main/askai/resources/assets/prompts/proxy-prompt.txt index 6ae986d8..ad6aa3fe 100644 --- a/src/main/askai/resources/assets/prompts/proxy-prompt.txt +++ b/src/main/askai/resources/assets/prompts/proxy-prompt.txt @@ -8,16 +8,14 @@ Before responding to the user, it is imperative that you follow the step-by-step - Determine whether the query suggests the user intends to end the conversation. -- Determine whether the query suggests the user intends to change the subject of the conversation. - - Determine whether real-time data are required for ensuring the utmost accuracy and relevance in responses. This pertains specifically to situations where the required information may be outdated or cannot be retrieved from the chat history or your existing database. -- Determine whether summarizing documents is necessary to provide an accurate and comprehensive response. +- Determine whether summarizing documents or web sites is necessary to provide an accurate and comprehensive response. + +- Prior to making any decisions, it's crucial to review the entire chat history. The context provided previously may influence the outcome, so it's important to ensure that the answer isn't already within that context before proceeding. - The final response is a formatted JSON with no additional description or context. 
- The final response 'JSON' must contain the boolean fields: 'intelligible', 'terminating', 'require_internet' and 'require_summarization'. - The final response 'JSON' must contain the string fields: fields: 'query_type' and 'question'. - -- Prior to making any decisions, it's crucial to review the entire chat history. The context provided previously may influence the outcome, so it's important to ensure that the answer isn't already within that context before proceeding.