From c23a2ba696502a89fd5c04f164e4469047482bbc Mon Sep 17 00:00:00 2001
From: Hugo Saporetti Junior <yorevs@gmail.com>
Date: Thu, 14 Mar 2024 04:04:53 -0300
Subject: [PATCH] Improving prompts - Summarization and Internet - 3

---
 src/main/askai/core/askai.py                  |  4 ++--
 src/main/askai/core/askai_messages.py         |  4 ++++
 .../askai/core/component/internet_service.py  | 22 +++++++++++++------
 src/main/askai/core/component/summarizer.py   | 19 +++++++++-------
 src/main/askai/core/model/query_response.py   |  1 +
 .../core/processor/internet_processor.py      |  7 ++++--
 src/main/askai/core/support/utilities.py      |  2 +-
 .../resources/assets/prompts/proxy-prompt.txt |  8 +++----
 8 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/src/main/askai/core/askai.py b/src/main/askai/core/askai.py
index 4ec6d51c..9b538885 100644
--- a/src/main/askai/core/askai.py
+++ b/src/main/askai/core/askai.py
@@ -258,8 +258,8 @@ def _process_response(self, proxy_response: QueryResponse) -> bool:
         # Query processors
         if processor or (q_type := proxy_response.query_type):
             if not processor and not (processor := AIProcessor.get_by_query_type(q_type)):
-                log.error(f"Unable to find a proper processor for query type: {q_type}")
-                self.reply_error(str(proxy_response))
+                log.error(f"Unable to find a proper processor: {str(proxy_response)}")
+                self.reply_error(msg.no_processor(q_type))
                 return False
             log.info("%s::Processing response for '%s'", processor, proxy_response.question)
             status, output = processor.process(proxy_response)
diff --git a/src/main/askai/core/askai_messages.py b/src/main/askai/core/askai_messages.py
index 004ba8a5..7f0da3b6 100644
--- a/src/main/askai/core/askai_messages.py
+++ b/src/main/askai/core/askai_messages.py
@@ -93,6 +93,10 @@ def invalid_cmd_format(self, output: str) -> str:
 
     # Failures
 
+    @lru_cache
+    def no_processor(self, query_type: str) -> str:
+        return self.translate(f"No suitable processor found for query type '{query_type}' !")
+
     @lru_cache
     def invalid_response(self, response_text: str) -> str:
         return self.translate(f"Received an invalid query response/type '{response_text}' !")
diff --git a/src/main/askai/core/component/internet_service.py b/src/main/askai/core/component/internet_service.py
index f8ccf9b0..07d6cc6c 100644
--- a/src/main/askai/core/component/internet_service.py
+++ b/src/main/askai/core/component/internet_service.py
@@ -16,10 +16,11 @@
 from typing import Optional, List
 
 from hspylib.core.metaclass.singleton import Singleton
-from langchain.chains import load_summarize_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.chains.retrieval_qa.base import RetrievalQA
 from langchain_community.document_loaders.async_html import AsyncHtmlLoader
 from langchain_community.utilities import GoogleSearchAPIWrapper
+from langchain_community.vectorstores.chroma import Chroma
 from langchain_core.documents import Document
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.tools import Tool
@@ -27,6 +28,8 @@
 
 from askai.core.askai_events import AskAiEvents
 from askai.core.askai_messages import msg
+from askai.core.component.cache_service import PERSIST_DIR
+from askai.core.component.summarizer import summarizer
 from askai.core.support.langchain_support import lc_llm, load_document
 
 
@@ -38,13 +41,18 @@ class InternetService(metaclass=Singleton):
     ASKAI_INTERNET_DATA_KEY = "askai-internet-data"
 
     @staticmethod
-    def scrap_sites(*sites: str) -> Optional[str]:
-        """TODO"""
+    def scrap_sites(query: str, *sites: str) -> Optional[str]:
+        """Scrape a web page and summarize its contents."""
         log.info("Scrapping sites: '%s'", str(sites))
-        docs: List[Document] = load_document(AsyncHtmlLoader, *sites)
-        chain = load_summarize_chain(lc_llm.create_chat_model(), chain_type="stuff")
-        search_results = chain.invoke(docs)
-        return search_results['output_text']
+        documents: List[Document] = load_document(AsyncHtmlLoader, list(sites))
+        if len(documents) > 0:
+            texts: List[Document] = summarizer.text_splitter.split_documents(documents)
+            v_store = Chroma.from_documents(texts, lc_llm.create_embeddings(), persist_directory=str(PERSIST_DIR))
+            retriever = RetrievalQA.from_chain_type(
+                llm=lc_llm.create_model(), chain_type="stuff", retriever=v_store.as_retriever())
+            search_results = retriever.invoke({"query": query})
+            return search_results['result']
+        return None
 
     def __init__(self):
         self._google = GoogleSearchAPIWrapper()
diff --git a/src/main/askai/core/component/summarizer.py b/src/main/askai/core/component/summarizer.py
index 39440928..fbcde35b 100644
--- a/src/main/askai/core/component/summarizer.py
+++ b/src/main/askai/core/component/summarizer.py
@@ -26,7 +26,7 @@
 from langchain_community.document_loaders import DirectoryLoader
 from langchain_community.vectorstores.chroma import Chroma
 from langchain_core.documents import Document
-from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
 
 from askai.core.askai_events import AskAiEvents
 from askai.core.askai_messages import msg
@@ -64,6 +64,10 @@ def glob(self) -> str:
     def path(self) -> str:
         return f"{self.folder}{self.glob}"
 
+    @property
+    def text_splitter(self) -> TextSplitter:
+        return self._text_splitter
+
     @lru_cache
     def generate(self, folder: str | Path, glob: str = None) -> None:
         """Generate a summarization of the folder contents.
@@ -74,11 +78,10 @@ def generate(self, folder: str | Path, glob: str = None) -> None:
         self._glob: str = glob.strip()
         AskAiEvents.ASKAI_BUS.events.reply.emit(message=msg.summarizing(self.path))
         log.info("Summarizing documents from '%s'", self.path)
-        embeddings = lc_llm.create_embeddings()
         documents: List[Document] = DirectoryLoader(self.folder, glob=self.glob).load()
         if len(documents) > 0:
             texts: List[Document] = self._text_splitter.split_documents(documents)
-            v_store = Chroma.from_documents(texts, embeddings, persist_directory=str(PERSIST_DIR))
+            v_store = Chroma.from_documents(texts, lc_llm.create_embeddings(), persist_directory=str(PERSIST_DIR))
             self._retriever = RetrievalQA.from_chain_type(
                 llm=lc_llm.create_model(), chain_type="stuff", retriever=v_store.as_retriever())
         else:
@@ -89,17 +92,17 @@ def query(self, *queries: str) -> Optional[List[SummaryResult]]:
         check_argument(len(queries) > 0)
         if self._retriever is not None:
             results: List[SummaryResult] = []
-            for query_string in queries:
-                if result := self.query_one(query_string):
+            for query in queries:
+                if result := self.query_one(query):
                     results.append(result)
             return results
         return None
 
     @lru_cache
-    def query_one(self, query_string: str) -> Optional[SummaryResult]:
+    def query_one(self, query: str) -> Optional[SummaryResult]:
         """Query the AI about a given query based on the summarized content."""
-        check_argument(len(query_string) > 0)
-        if result := self._retriever.invoke({"query": query_string}):
+        check_argument(len(query) > 0)
+        if result := self._retriever.invoke({"query": query}):
             return SummaryResult(
                 self._folder, self._glob, result['query'].strip(), result['result'].strip()
             )
diff --git a/src/main/askai/core/model/query_response.py b/src/main/askai/core/model/query_response.py
index b756398a..46302ce3 100644
--- a/src/main/askai/core/model/query_response.py
+++ b/src/main/askai/core/model/query_response.py
@@ -25,6 +25,7 @@ class QueryResponse:
     response: str = ""
     terminating: bool = False
     intelligible: bool = True
+    changing_subject: bool = False
     require_internet: bool = False
     require_summarization: bool = False
     commands: List[TerminalCommand] = field(default_factory=list)
diff --git a/src/main/askai/core/processor/internet_processor.py b/src/main/askai/core/processor/internet_processor.py
index 3ebf49b8..791b1437 100644
--- a/src/main/askai/core/processor/internet_processor.py
+++ b/src/main/askai/core/processor/internet_processor.py
@@ -13,6 +13,7 @@
    Copyright·(c)·2024,·HSPyLib
 """
 import logging as log
+from functools import partial
 from typing import Optional, Tuple
 
 from hspylib.core.zoned_datetime import now
@@ -37,7 +38,6 @@ def __init__(self):
 
     def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]:
         status = False
-        output = None
         template = PromptTemplate(input_variables=['cur_date'], template=self.template())
         final_prompt: str = msg.translate(template.format(cur_date=now('%Y-%d-%m')))
         shared.context.set("SETUP", final_prompt, "system")
@@ -49,7 +49,10 @@ def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]:
             if (response := shared.engine.ask(context, temperature=0.0, top_p=0.0)) and response.is_success:
                 search: SearchResult = object_mapper.of_json(response.message, SearchResult)
                 query = " + ".join(search.keywords)
-                if results := internet.search_google(query, *search.sites):
+                fc_call = partial(internet.scrap_sites, query) \
+                    if query_response.require_summarization \
+                    else partial(internet.search_google, query)
+                if results := fc_call(*search.sites):
                     search.results = results
                     output = self._wrap_output(query_response, search)
                     shared.context.set("INTERNET", output, "assistant")
diff --git a/src/main/askai/core/support/utilities.py b/src/main/askai/core/support/utilities.py
index 6fbe9ef8..dcc74b3b 100644
--- a/src/main/askai/core/support/utilities.py
+++ b/src/main/askai/core/support/utilities.py
@@ -57,7 +57,7 @@ def beautify(text: Any) -> str:
     text = re.sub(r"([Jj]oke( [Tt]ime)?)[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''}  Joke: ", text)
     text = re.sub(r"[Ff]un [Ff]acts?[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''}  Fun Fact: ", text)
     text = re.sub(r"[Aa]dvice[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''}  Advice: ", text)
-    text = re.sub(r"Errors?[-:\s][ \n\t]*", f"%EL0%{ASKAI_CHAT_ICONS['']}{''}  Error: ", text)
+    text = re.sub(r"Errors?[-:\s][ \n\t]*", f"%EL1%{ASKAI_CHAT_ICONS['']}{''}  Error: ", text)
     text = re.sub(r"^\n+", '', text, re.MULTILINE)
     text = re.sub(r"\n{2,}", '\n', text, re.MULTILINE)
     text = re.sub(re_url, r'%CYAN% \1%GREEN%', text)
diff --git a/src/main/askai/resources/assets/prompts/proxy-prompt.txt b/src/main/askai/resources/assets/prompts/proxy-prompt.txt
index 6ae986d8..ad6aa3fe 100644
--- a/src/main/askai/resources/assets/prompts/proxy-prompt.txt
+++ b/src/main/askai/resources/assets/prompts/proxy-prompt.txt
@@ -8,16 +8,14 @@ Before responding to the user, it is imperative that you follow the step-by-step
 
 - Determine whether the query suggests the user intends to end the conversation.
 
-- Determine whether the query suggests the user intends to change the subject of the conversation.
-
 - Determine whether real-time data are required for ensuring the utmost accuracy and relevance in responses. This pertains specifically to situations where the required information may be outdated or cannot be retrieved from the chat history or your existing database.
 
-- Determine whether summarizing documents is necessary to provide an accurate and comprehensive response.
+- Determine whether summarizing documents or web sites is necessary to provide an accurate and comprehensive response.
+
+- Prior to making any decisions, it's crucial to review the entire chat history. The context provided previously may influence the outcome, so it's important to ensure that the answer isn't already within that context before proceeding.
 
 - The final response is a formatted JSON with no additional description or context.
 
 - The final response 'JSON' must contain the boolean fields: 'intelligible', 'terminating', 'require_internet' and 'require_summarization'.
 
 - The final response 'JSON' must contain the string fields: fields: 'query_type' and 'question'.
-
-- Prior to making any decisions, it's crucial to review the entire chat history. The context provided previously may influence the outcome, so it's important to ensure that the answer isn't already within that context before proceeding.