Commit f2ac96b: Improving prompts - Summarization and Internet
yorevs committed Mar 14, 2024 (1 parent: 25fe495)
Showing 17 changed files with 132 additions and 60 deletions.
3 changes: 3 additions & 0 deletions dependencies.hspd
@@ -25,6 +25,9 @@ package: langchain-community
package: unstructured
package: python-magic-bin
package: chromadb
package: tiktoken
package: aiohttp
package: html2text
package: PyAudio
package: SpeechRecognition
package: openai-whisper
5 changes: 2 additions & 3 deletions src/main/askai/core/askai.py
@@ -43,7 +43,6 @@
from askai.core.processor.ai_processor import AIProcessor
from askai.core.processor.generic_processor import GenericProcessor
from askai.core.processor.internet_processor import InternetProcessor
from askai.core.processor.output_processor import OutputProcessor
from askai.core.processor.processor_proxy import proxy
from askai.core.processor.summary_processor import SummaryProcessor
from askai.core.support.object_mapper import object_mapper
@@ -244,14 +243,14 @@ def _process_response(self, proxy_response: QueryResponse) -> bool:
# Intrinsic features
if not proxy_response.intelligible:
self.reply_error(msg.intelligible(proxy_response.question))
return False
return True
elif proxy_response.terminating:
log.info("User wants to terminate the conversation.")
return False
elif proxy_response.require_internet:
log.info("Internet is required to fulfill the request.")
processor = AIProcessor.get_by_name(InternetProcessor.__name__)
processor.bind(AIProcessor.get_by_query_type(proxy_response.query_type))
processor.bind(AIProcessor.get_by_name(GenericProcessor.__name__))
elif proxy_response.require_summarization:
log.info("Summarization is required to fulfill the request.")
processor = AIProcessor.get_by_name(SummaryProcessor.__name__)
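Note on the routing change above: when internet is required, the InternetProcessor is now chained to the GenericProcessor instead of the query-type processor. A minimal sketch of this bind-and-forward pattern (illustrative names only, not the project's actual classes):

from typing import Optional

class Proc:
    """Hypothetical processor that may forward its output to a bound follow-up."""

    def __init__(self, name: str) -> None:
        self.name = name
        self._next: Optional["Proc"] = None

    def bind(self, nxt: "Proc") -> None:
        self._next = nxt  # follow-up processor, mirroring AIProcessor.bind

    def process(self, text: str) -> str:
        out = f"[{self.name}] {text}"
        # Forward to the bound processor, if any.
        return self._next.process(out) if self._next else out

internet, generic = Proc("internet"), Proc("generic")
internet.bind(generic)  # internet results get a generic post-processing pass
print(internet.process("recent news"))  # [generic] [internet] recent news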
6 changes: 5 additions & 1 deletion src/main/askai/core/askai_messages.py
@@ -75,6 +75,10 @@ def summarizing(self, path: str) -> str:
def cmd_no_output(self) -> str:
return self.translate(f"The command didn't return an output !")

@lru_cache
def search_empty(self) -> str:
return self.translate(f"The google research didn't return an output !")

@lru_cache
def access_grant(self) -> str:
return self.translate(f"'AskAI' requires access to your files, folders and apps. Continue (yes/[no])?")
@@ -103,7 +107,7 @@ def cmd_failed(self, cmd_line: str) -> str:

@lru_cache
def intelligible(self, question: str) -> str:
return self.translate(f"Your question '{question}' is not clear, please rephrase !")
return self.translate(f"Your question '{question}' is not clear, please reformulate !")

@lru_cache
def llm_error(self, error: str) -> str:
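The new search_empty message follows the same cached-translation pattern as its siblings. A self-contained sketch of that pattern (translate() here is a stand-in identity, not the project's real translator):

from functools import lru_cache

class Messages:
    def translate(self, text: str) -> str:
        return text  # placeholder for the real i18n translation step

    @lru_cache
    def search_empty(self) -> str:
        # Cached so repeated calls don't re-translate the same string.
        return self.translate("The Google search didn't return any results !")

msg = Messages()
assert msg.search_empty() is msg.search_empty()  # same cached object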
46 changes: 27 additions & 19 deletions src/main/askai/core/component/internet_service.py
@@ -13,17 +13,21 @@
Copyright·(c)·2024,·HSPyLib
"""
import logging as log
import os
from functools import lru_cache
from typing import Optional
from typing import Optional, List

from hspylib.core.metaclass.singleton import Singleton
from langchain.chains import load_summarize_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders.async_html import AsyncHtmlLoader
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import Tool
from langchain_text_splitters import CharacterTextSplitter

from askai.core.askai_events import AskAiEvents
from askai.core.askai_messages import msg
from askai.core.support.langchain_support import lc_llm
from askai.core.support.langchain_support import lc_llm, load_document


class InternetService(metaclass=Singleton):
@@ -33,37 +37,41 @@ class InternetService(metaclass=Singleton):

ASKAI_INTERNET_DATA_KEY = "askai-internet-data"

@staticmethod
def scrap_sites(*sites: str) -> Optional[str]:
"""TODO"""
log.info("Scrapping sites: '%s'", str(sites))
docs: List[Document] = load_document(AsyncHtmlLoader, *sites)
chain = load_summarize_chain(lc_llm.create_chat_model(), chain_type="stuff")
search_results = chain.invoke(docs)
return search_results['output_text']

def __init__(self):
self._google = GoogleSearchAPIWrapper()
self._tool = Tool(
name="google_search",
description="Search Google for recent results.",
func=self._google.run)

@lru_cache
def search_google(self, query: str, *sites: str) -> Optional[str]:
"""Search the web using google search API.
:param query: The google search query string.
:param sites: The sites you want google to search for.
"""
search_results: str = ''
AskAiEvents.ASKAI_BUS.events.reply.emit(message=msg.searching())
log.info("Searching GOOGLE for '%s' url: '%s'", query, str(sites))
if sites:
if len(sites) > 0:
log.info("Searching GOOGLE for '%s' url: '%s'", query, str(sites))
search_results: str = ''
for url in sites:
search_results += str(self._tool.run(f"{query} site: {url}"))
else:
search_results += str(self._tool.run(f"{query}"))
log.debug(f"Internet search output: %s", search_results)
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
docs: List[Document] = [Document(page_content=x) for x in text_splitter.split_text(search_results)]
prompt = ChatPromptTemplate.from_messages([("system", "{query}\n\n{context}")])
chain = create_stuff_documents_chain(lc_llm.create_chat_model(), prompt)
search_results = chain.invoke({"query": query, "context": docs})
return search_results

return search_results
return None


assert (internet := InternetService().INSTANCE) is not None


if __name__ == '__main__':
q = 'What is the weather like in Belo Horizonte now'
embeddings = lc_llm.create_embeddings()
c = internet.search_google(q)
eq = embeddings.embed_query(q)
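For reference, the reworked search_google flow splits the raw Google output into chunks and "stuffs" them into a single prompt. A standalone sketch of that summarization step, assuming an OpenAI key in the environment (ChatOpenAI stands in for lc_llm.create_chat_model()):

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter

def refine_search(query: str, raw_results: str) -> str:
    # Chunk the raw search output, mirroring the diff's sizes.
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    docs = [Document(page_content=chunk) for chunk in splitter.split_text(raw_results)]
    # "Stuff" every chunk into one prompt next to the query.
    prompt = ChatPromptTemplate.from_messages([("system", "{query}\n\n{context}")])
    chain = create_stuff_documents_chain(ChatOpenAI(temperature=0.0), prompt)
    return chain.invoke({"query": query, "context": docs})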
20 changes: 16 additions & 4 deletions src/main/askai/core/engine/ai_engine.py
@@ -12,6 +12,8 @@
Copyright·(c)·2024,·HSPyLib
"""
from langchain_core.language_models import BaseLLM, BaseChatModel

from askai.core.model.ai_model import AIModel
from askai.core.model.ai_reply import AIReply
from typing import Any, List, Optional, Protocol
@@ -20,8 +22,18 @@
class AIEngine(Protocol):
"""Provide an interface for AI engines."""

def lc_model(self, temperature: float, top_p: float) -> Any:
"""Create a LangChain AI model instance."""
def lc_model(self, temperature: float, top_p: float) -> BaseLLM:
"""Create a LangChain AI model instance.
:param temperature: The model engine temperature.
:param top_p: The model engine top_p.
"""
...

def lc_chat_model(self, temperature: float = 0.0) -> BaseChatModel:
"""Create a LangChain OpenAI llm chat model instance.
:param temperature: The model engine temperature.
"""
...

def lc_embeddings(self) -> Any:
"""Create a LangChain AI embeddings instance."""
@@ -50,8 +62,8 @@ def models(self) -> List[AIModel]:
def ask(self, chat_context: List[dict], temperature: float = 0.8, top_p: float = 0.0) -> AIReply:
"""Ask AI assistance for the given question and expect a response.
:param chat_context: The chat history or context.
:param temperature: TODO
:param top_p: TODO
:param temperature: The model engine temperature.
:param top_p: The model engine top_p.
"""
...

13 changes: 10 additions & 3 deletions src/main/askai/core/engine/openai/openai_engine.py
@@ -16,11 +16,13 @@
import logging as log
import os
from threading import Thread
from typing import Any, List, Optional
from typing import List, Optional

import langchain_openai
import pause
from hspylib.core.preconditions import check_not_none
from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseLLM, BaseChatModel
from openai import APIError, OpenAI

from askai.core.component.audio_player import AudioPlayer
@@ -52,12 +54,17 @@ def url(self) -> str:
def client(self) -> OpenAI:
return self._client

def lc_model(self, temperature: float = 0.8, top_p: float = 0.0) -> Any:
def lc_model(self, temperature: float = 0.0, top_p: float = 0.0) -> BaseLLM:
"""Create a LangChain OpenAI llm model instance."""
return langchain_openai.OpenAI(
openai_api_key=self._api_key, temperature=temperature, top_p=top_p)

def lc_embeddings(self) -> Any:
def lc_chat_model(self, temperature: float = 0.0) -> BaseChatModel:
"""Create a LangChain OpenAI llm chat model instance."""
return langchain_openai.ChatOpenAI(
openai_api_key=self._api_key, temperature=temperature)

def lc_embeddings(self) -> Embeddings:
"""Create a LangChain AI embeddings instance."""
return langchain_openai.OpenAIEmbeddings()

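A hedged usage sketch of the two factory methods above (engine construction and API-key wiring assumed to be already in place):

from askai.core.support.shared_instances import shared

engine = shared.engine                             # assumes the engine was created at startup
llm = engine.lc_model(temperature=0.0, top_p=0.0)  # completion-style model
chat = engine.lc_chat_model(temperature=0.0)       # chat-style model
print(llm.invoke("Translate 'hello' to Portuguese"))
print(chat.invoke("Translate 'hello' to Portuguese").content)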
9 changes: 5 additions & 4 deletions src/main/askai/core/model/chat_context.py
@@ -30,7 +30,7 @@ class ChatContext:

def __init__(self, token_limit: int):
self._context = defaultdict(list)
self._token_limit: int = token_limit #* 1024 # The limit is given in KB
self._token_limit: int = token_limit * 1024 # The limit is given in KB

def __str__(self):
return os.linesep.join(f"'{k}': '{v}'" for k, v in self._context.items())
@@ -79,8 +79,9 @@ def join(self, *keys: str) -> ContextRaw:
context.extend(self.get(key))
return context

def clear(self, key: str) -> int:
def clear(self, *keys: str) -> int:
"""Clear the all the chat context specified by key."""
if self._context[key]:
del self._context[key]
for key in keys:
    if key in self._context:  # membership test avoids creating an empty defaultdict entry
        del self._context[key]
return len(self._context)
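Usage sketch for the now-variadic clear() (assuming ChatContext's set(key, content, role) signature seen elsewhere in this commit):

ctx = ChatContext(token_limit=8)                  # 8 KB budget, per the comment above
ctx.set("INTERNET", "search summary", "assistant")
ctx.set("SUMMARY", "document summary", "assistant")
remaining = ctx.clear("INTERNET", "SUMMARY")      # both keys dropped in one call
assert remaining == 0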
2 changes: 1 addition & 1 deletion src/main/askai/core/model/summary_result.py
@@ -26,4 +26,4 @@ class SummaryResult:
answer: str = None

def __str__(self):
return f"Summarization: {json.dumps(self.__dict__, default=lambda obj: obj.__dict__)}"
return f"Summarization results: {json.dumps(self.__dict__, default=lambda obj: obj.__dict__)}"
1 change: 1 addition & 0 deletions src/main/askai/core/processor/generic_processor.py
@@ -53,6 +53,7 @@ def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]:
cache.save_reply(query_response.question, output)
cache.save_query_history()
status = True
shared.context.clear("INTERNET", "SUMMARY")
else:
output = msg.llm_error(response.message)

2 changes: 2 additions & 0 deletions src/main/askai/core/processor/internet_processor.py
@@ -55,6 +55,8 @@ def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]:
shared.context.set("INTERNET", output, "assistant")
cache.save_reply(query_response.question, output)
status = True
else:
output = msg.search_empty()
else:
output = msg.llm_error(response.message)
else:
21 changes: 19 additions & 2 deletions src/main/askai/core/support/langchain_support.py
@@ -1,13 +1,20 @@
from typing import Any, Dict, List
from functools import lru_cache
from typing import Any, Dict, List, Type

from hspylib.core.metaclass.singleton import Singleton
from hspylib.core.preconditions import check_not_none
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

from askai.core.model.chat_context import ChatContext
from askai.core.support.shared_instances import shared


def load_document(loader_type: Type, url: str | List[str]) -> List[Document]:
"""TODO"""
return loader_type(url).load()


class LangChainSupport(metaclass=Singleton):
"""TODO"""

@@ -16,18 +23,28 @@ class LangChainSupport(metaclass=Singleton):
LANGCHAIN_ROLE_MAP: Dict = {"user": HumanMessage, "system": SystemMessage, "assistant": AIMessage}

@staticmethod
def create_model(temperature: float = 0.8, top_p: float = 0.0) -> Any:
@lru_cache
def create_model(temperature: float = 0.0, top_p: float = 0.0) -> Any:
"""TODO"""
check_not_none(shared.engine, "AI Engine was not created yet!")
return shared.engine.lc_model(temperature, top_p)

@staticmethod
@lru_cache
def create_chat_model(temperature: float = 0.0) -> Any:
"""TODO"""
check_not_none(shared.engine, "AI Engine was not created yet!")
return shared.engine.lc_chat_model(temperature)

@staticmethod
@lru_cache
def create_embeddings() -> Any:
"""TODO"""
check_not_none(shared.engine, "AI Engine was not created yet!")
return shared.engine.lc_embeddings()

@classmethod
@lru_cache
def get_context(cls, key: str) -> List:
"""TODO"""
context: List[ChatContext.ContextEntry] = shared.context[key]
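Illustrative use of the new load_document helper (network access assumed; the URL is a placeholder):

from langchain_community.document_loaders.async_html import AsyncHtmlLoader

docs = load_document(AsyncHtmlLoader, ["https://example.com"])
print(docs[0].page_content[:200])  # first 200 chars of the fetched HTML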
42 changes: 30 additions & 12 deletions src/main/askai/core/support/utilities.py
@@ -12,22 +12,23 @@
Copyright·(c)·2024,·HSPyLib
"""
from askai.core.support.presets import Presets
from askai.language.language import Language
import hashlib
import os
import re
from os.path import basename, dirname
from pathlib import Path
from typing import Any, Optional, Tuple

import pause
from clitt.core.term.cursor import Cursor
from hspylib.core.enums.charset import Charset
from hspylib.core.preconditions import check_argument
from hspylib.core.tools.commons import file_is_not_empty, sysout
from hspylib.core.tools.text_tools import ensure_endswith
from hspylib.modules.cli.vt100.vt_color import VtColor
from os.path import basename, dirname
from pathlib import Path
from typing import Any, Optional, Tuple

import hashlib
import os
import pause
import re
from askai.core.support.presets import Presets
from askai.language.language import Language

ASKAI_CHAT_ICONS = {
"": "%RED%",
@@ -45,8 +46,12 @@ def beautify(text: Any) -> str:
:param text: The text to be beautified.
"""
# fmt: off
re_url = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
re_url = (
r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|'
r'www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
r'[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')
text = str(text)
text = re.sub(r"\n{2,}", '\n\n', text)
text = re.sub(r"[Hh]ints?( and tips)?[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Tips: ", text)
text = re.sub(r"[Aa]nalysis[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Analysis: ", text)
text = re.sub(r"[Ss]ummary[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Summary:", text)
@@ -137,7 +142,7 @@ def stream_text(text: Any, tempo: int = 1, language: Language = Language.EN_US)
for i, char in enumerate(text):
if char == "%" and (i + 1) < len(text):
try:
if (color := text[i + 1 : text.index("%", i + 1)]) in VtColor.names():
if (color := text[i + 1: text.index("%", i + 1)]) in VtColor.names():
hide, idx = True, text.index("%", i + 1)
sysout(f"%{color}%", end="")
continue
@@ -188,4 +193,17 @@ def stream_text(text: Any, tempo: int = 1, language: Language = Language.EN_US)


if __name__ == '__main__':
display_text(" Error: 'LLM' returned an error: Directory not found: 'HomeSetup/docs/'")
display_text("""
Este text tem ln
aqui
este
"""
)
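Quick sanity check of the re_url pattern above (pattern copied verbatim from the diff; the sample URLs are illustrative):

import re

re_url = (
    r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|'
    r'www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
    r'[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')

sample = "Docs at https://github.com/yorevs and www.example.com/page"
print(re.findall(re_url, sample))  # ['https://github.com/yorevs', 'www.example.com/page']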
@@ -14,4 +14,4 @@ Before responding to the user, it is imperative that you follow the step-by-step

- Start your response with the phrase: Analysing the provided data\n

- Wrap up your reply by offering a summarized analysis about the content; prefix with: \n'Analysis:'.
- Wrap up your reply by offering a summarized analysis about the content; prefix with: 'Analysis:'.