Skip to content

Commit

Permalink
Improving prompts - Summarization and Internet - 3
Browse files Browse the repository at this point in the history
  • Loading branch information
yorevs committed Mar 14, 2024
1 parent d5bff8d commit c23a2ba
Show file tree
Hide file tree
Showing 8 changed files with 42 additions and 25 deletions.
4 changes: 2 additions & 2 deletions src/main/askai/core/askai.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,8 +258,8 @@ def _process_response(self, proxy_response: QueryResponse) -> bool:
# Query processors
if processor or (q_type := proxy_response.query_type):
if not processor and not (processor := AIProcessor.get_by_query_type(q_type)):
log.error(f"Unable to find a proper processor for query type: {q_type}")
self.reply_error(str(proxy_response))
log.error(f"Unable to find a proper processor: {str(proxy_response)}")
self.reply_error(msg.no_processor(q_type))
return False
log.info("%s::Processing response for '%s'", processor, proxy_response.question)
status, output = processor.process(proxy_response)
Expand Down
4 changes: 4 additions & 0 deletions src/main/askai/core/askai_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ def invalid_cmd_format(self, output: str) -> str:

# Failures

@lru_cache
def no_processor(self, query_type: str) -> str:
    """Return the localized failure message for a query type that has no processor.
    :param query_type: The query type for which no processor could be found.
    """
    message = f"No suitable processor found for query type '{query_type}' !"
    return self.translate(message)

@lru_cache
def invalid_response(self, response_text: str) -> str:
    """Return the localized failure message for an invalid query response/type.
    :param response_text: The offending response text received from the engine.
    """
    message = f"Received an invalid query response/type '{response_text}' !"
    return self.translate(message)
Expand Down
22 changes: 15 additions & 7 deletions src/main/askai/core/component/internet_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,20 @@
from typing import Optional, List

from hspylib.core.metaclass.singleton import Singleton
from langchain.chains import load_summarize_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain_community.document_loaders.async_html import AsyncHtmlLoader
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import Tool
from langchain_text_splitters import CharacterTextSplitter

from askai.core.askai_events import AskAiEvents
from askai.core.askai_messages import msg
from askai.core.component.cache_service import PERSIST_DIR
from askai.core.component.summarizer import summarizer
from askai.core.support.langchain_support import lc_llm, load_document


Expand All @@ -38,13 +41,18 @@ class InternetService(metaclass=Singleton):
ASKAI_INTERNET_DATA_KEY = "askai-internet-data"

@staticmethod
def scrap_sites(query: str, *sites: str) -> Optional[str]:
    """Scrape the given web sites and answer the query from their summarized contents.
    Loads each site as an HTML document, splits it into chunks, embeds the chunks into
    a Chroma vector store, and runs a "stuff" RetrievalQA chain over it.
    :param query: The question to answer from the scraped content.
    :param sites: The URLs of the sites to scrape.
    :return: The answer text, or None when no document could be loaded.
    """
    log.info("Scrapping sites: '%s'", str(sites))
    documents: List[Document] = load_document(AsyncHtmlLoader, list(sites))
    if documents:
        texts: List[Document] = summarizer.text_splitter.split_documents(documents)
        v_store = Chroma.from_documents(texts, lc_llm.create_embeddings(), persist_directory=str(PERSIST_DIR))
        # RetrievalQA.from_chain_type returns a QA chain, not a retriever — name it accordingly.
        qa_chain = RetrievalQA.from_chain_type(
            llm=lc_llm.create_model(), chain_type="stuff", retriever=v_store.as_retriever())
        search_results = qa_chain.invoke({"query": query})
        return search_results['result']
    return None

def __init__(self):
self._google = GoogleSearchAPIWrapper()
Expand Down
19 changes: 11 additions & 8 deletions src/main/askai/core/component/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter

from askai.core.askai_events import AskAiEvents
from askai.core.askai_messages import msg
Expand Down Expand Up @@ -64,6 +64,10 @@ def glob(self) -> str:
def path(self) -> str:
return f"{self.folder}{self.glob}"

@property
def text_splitter(self) -> TextSplitter:
    """The text splitter used to chunk documents before embedding."""
    return self._text_splitter

@lru_cache
def generate(self, folder: str | Path, glob: str = None) -> None:
"""Generate a summarization of the folder contents.
Expand All @@ -74,11 +78,10 @@ def generate(self, folder: str | Path, glob: str = None) -> None:
self._glob: str = glob.strip()
AskAiEvents.ASKAI_BUS.events.reply.emit(message=msg.summarizing(self.path))
log.info("Summarizing documents from '%s'", self.path)
embeddings = lc_llm.create_embeddings()
documents: List[Document] = DirectoryLoader(self.folder, glob=self.glob).load()
if len(documents) > 0:
texts: List[Document] = self._text_splitter.split_documents(documents)
v_store = Chroma.from_documents(texts, embeddings, persist_directory=str(PERSIST_DIR))
v_store = Chroma.from_documents(texts, lc_llm.create_embeddings(), persist_directory=str(PERSIST_DIR))
self._retriever = RetrievalQA.from_chain_type(
llm=lc_llm.create_model(), chain_type="stuff", retriever=v_store.as_retriever())
else:
def query(self, *queries: str) -> Optional[List[SummaryResult]]:
    """Query the AI about each of the given questions, based on the summarized content.
    :param queries: One or more questions to ask.
    :return: The list of successful results, or None when no summary has been generated yet.
    """
    check_argument(len(queries) > 0)
    if self._retriever is not None:
        results: List[SummaryResult] = []
        for query in queries:
            # Only queries that yielded a result are collected.
            if result := self.query_one(query):
                results.append(result)
        return results
    return None

@lru_cache
def query_one(self, query: str) -> Optional[SummaryResult]:
    """Query the AI about a given query based on the summarized content.
    :param query: The question to ask about the summarized documents.
    :return: A SummaryResult holding the query and its answer, or None when no result.
    """
    check_argument(len(query) > 0)
    if result := self._retriever.invoke({"query": query}):
        return SummaryResult(
            self._folder, self._glob, result['query'].strip(), result['result'].strip()
        )
    return None
Expand Down
1 change: 1 addition & 0 deletions src/main/askai/core/model/query_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class QueryResponse:
response: str = ""
terminating: bool = False
intelligible: bool = True
changing_subject: bool = False
require_internet: bool = False
require_summarization: bool = False
commands: List[TerminalCommand] = field(default_factory=list)
Expand Down
7 changes: 5 additions & 2 deletions src/main/askai/core/processor/internet_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Copyright·(c)·2024,·HSPyLib
"""
import logging as log
from functools import partial
from typing import Optional, Tuple

from hspylib.core.zoned_datetime import now
Expand All @@ -37,7 +38,6 @@ def __init__(self):

def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]:
status = False
output = None
template = PromptTemplate(input_variables=['cur_date'], template=self.template())
final_prompt: str = msg.translate(template.format(cur_date=now('%Y-%d-%m')))
shared.context.set("SETUP", final_prompt, "system")
Expand All @@ -49,7 +49,10 @@ def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]:
if (response := shared.engine.ask(context, temperature=0.0, top_p=0.0)) and response.is_success:
search: SearchResult = object_mapper.of_json(response.message, SearchResult)
query = " + ".join(search.keywords)
if results := internet.search_google(query, *search.sites):
fc_call = partial(internet.scrap_sites, query) \
if query_response.require_summarization \
else partial(internet.search_google, query)
if results := fc_call(*search.sites):
search.results = results
output = self._wrap_output(query_response, search)
shared.context.set("INTERNET", output, "assistant")
Expand Down
2 changes: 1 addition & 1 deletion src/main/askai/core/support/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def beautify(text: Any) -> str:
text = re.sub(r"([Jj]oke( [Tt]ime)?)[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Joke: ", text)
text = re.sub(r"[Ff]un [Ff]acts?[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Fun Fact: ", text)
text = re.sub(r"[Aa]dvice[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Advice: ", text)
text = re.sub(r"Errors?[-:\s][ \n\t]*", f"%EL0%{ASKAI_CHAT_ICONS['']}{''} Error: ", text)
text = re.sub(r"Errors?[-:\s][ \n\t]*", f"%EL1%{ASKAI_CHAT_ICONS['']}{''} Error: ", text)
text = re.sub(r"^\n+", '', text, re.MULTILINE)
text = re.sub(r"\n{2,}", '\n', text, re.MULTILINE)
text = re.sub(re_url, r'%CYAN% \1%GREEN%', text)
Expand Down
8 changes: 3 additions & 5 deletions src/main/askai/resources/assets/prompts/proxy-prompt.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,14 @@ Before responding to the user, it is imperative that you follow the step-by-step

- Determine whether the query suggests the user intends to end the conversation.

- Determine whether the query suggests the user intends to change the subject of the conversation.

- Determine whether real-time data are required for ensuring the utmost accuracy and relevance in responses. This pertains specifically to situations where the required information may be outdated or cannot be retrieved from the chat history or your existing database.

- Determine whether summarizing documents is necessary to provide an accurate and comprehensive response.
- Determine whether summarizing documents or web sites is necessary to provide an accurate and comprehensive response.

- Prior to making any decisions, it's crucial to review the entire chat history. The context provided previously may influence the outcome, so it's important to ensure that the answer isn't already within that context before proceeding.

- The final response is a formatted JSON with no additional description or context.

- The final response 'JSON' must contain the boolean fields: 'intelligible', 'terminating', 'require_internet' and 'require_summarization'.

- The final response 'JSON' must contain the string fields: 'query_type' and 'question'.

- Prior to making any decisions, it's crucial to review the entire chat history. The context provided previously may influence the outcome, so it's important to ensure that the answer isn't already within that context before proceeding.

0 comments on commit c23a2ba

Please sign in to comment.