Skip to content

Commit

Permalink
Fix InternetSearch format and query. Removed one agent run
Browse files Browse the repository at this point in the history
  • Loading branch information
yorevs committed Sep 30, 2024
1 parent 45de799 commit 42d4394
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 111 deletions.
145 changes: 46 additions & 99 deletions src/main/askai/core/component/internet_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,34 +18,28 @@
from collections import defaultdict
from typing import List

import bs4
from googleapiclient.errors import HttpError
from hspylib.core.metaclass.singleton import Singleton
from hspylib.core.zoned_datetime import now
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import APIError

from askai.__classpath__ import API_KEYS
from askai.core.askai_configs import configs
from askai.core.askai_events import events
from askai.core.askai_messages import msg
from askai.core.askai_prompt import prompt
from askai.core.component.geo_location import geo_location
from askai.core.component.summarizer import summarizer
from askai.core.engine.openai.temperature import Temperature
from askai.core.model.ai_reply import AIReply
from askai.core.model.search_result import SearchResult
from askai.core.support.langchain_support import lc_llm
from askai.core.support.shared_instances import shared
from googleapiclient.errors import HttpError
from hspylib.core.metaclass.singleton import Singleton
from hspylib.core.zoned_datetime import now
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables.utils import Output
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import APIError


class InternetService(metaclass=Singleton):
Expand Down Expand Up @@ -84,35 +78,40 @@ class InternetService(metaclass=Singleton):
"docker.com": "",
"dropbox.com": "",
"google.com": "",
"search.google.com": "",
"paypal.com": "",
"wikipedia.org": "荒",
"reddit.com": "",
"tiktok.com": "懲",
"ubuntu.com": "",
"fedora.com": "",
})
# fmt: on

@classmethod
def _url_to_icon(cls, url: str) -> str:
    """Translate a site domain into its icon glyph, when one is known.
    :param url: The site domain to translate (e.g. 'github.com').
    :return: The icon mapped in SITE_ICONS when the entry exists and is non-empty;
             otherwise the original URL unchanged.
    """
    # Use .get() instead of [] so the lookup does not insert a new key into the
    # shared SITE_ICONS defaultdict as a side effect; `or` falls back to the URL
    # for missing entries and for entries deliberately mapped to "".
    return cls.SITE_ICONS.get(url) or url

@classmethod
def wrap_response(cls, terms: str, output: str, search: SearchResult) -> str:
"""Format and wrap the search response based on the search terms, output, and method used.
:param terms: The search terms used in the query.
:param output: The raw output or results from the search.
:param result: The search result.
:param search: The search result.
:return: A formatted string that encapsulates the search response.
"""
terms: str = re.sub(r"\s{2,}", " ", terms)
sites: set[str] = set(re.findall(r"site:(.+?\..+?)\s+", terms) + result.sites)
sources: str = " ".join(
filter(len, set(sorted([f"{s.replace(s, cls.SITE_ICONS[s]):<2}".strip() or s for s in sites], key=len)))
)
re_site: str = r"site:([a-zA-Z0-9._%+-]+(?:\.[a-zA-Z]{2,})+)"
sites: list = re.findall(re_site, terms)
terms: str = re.sub(r"\s{2,}", " ", re.sub(r"OR", "", re.sub(re_site, "", terms)))
sources: str = " ".join(sorted([f"{cls._url_to_icon(s):<2}".strip() or s for s in sites], key=len))
# fmt: off
return (
f"Your {result.engine.title()} search has returned the following results:"
f"Your {search.engine.title()} search has returned the following results:"
f"\n\n{output}\n\n---\n\n"
f"`{cls.CATEGORY_ICONS[result.category]:<2}{result.category}` **Sources:** {sources} "
f"`{cls.CATEGORY_ICONS[search.category]:<2}{search.category}` "
f"**Sources:** *{sources}* "
f"**Access:** {geo_location.location} - *{now('%B %d, %Y')}*\n\n"
f">  Terms: {terms}")
f">  Terms: {terms} \n")
# fmt: on

@staticmethod
Expand All @@ -123,25 +122,35 @@ def _build_google_query(search: SearchResult) -> str:
"""
# The order of conditions is important here, as the execution may stop early if a condition is met.
google_query = ""
defaults: list[str] = []
# fmt: off
match search.category.casefold():
case "people":
defaults = ["linkedin.com", "github.com", "instagram.com", "facebook.com"]
if any((f.find("intext:") >= 0 for f in search.filters)):
google_query = (
"description AND background AND work AND achievements "
"background OR current work OR achievements "
f'{next((f for f in search.filters if f.startswith("intext:")), None)}'
)
case "weather":
defaults = ["weather.com", "accuweather.com", "weather.gov"]
if any((f.find("weather:") >= 0 for f in search.filters)):
google_query = (
f'{now("%B %d %Y")} {next((f for f in search.filters if f.startswith("weather:")), None)}'
f"{now('%B %d %Y')}"
f'{next((f for f in search.filters if f.startswith("weather:")), None)}'
)
case "programming":
defaults = ["stackoverflow.com", "github.com"]
case "general":
defaults = ["google.com", "bing.com", "duckduckgo.com", "ask.com"]
case _:
if search.keywords:
# Gather the sites to be used in the search.
sites = f"{' OR '.join(set('site:' + url for url in search.sites))}"
google_query = f"{' '.join(set(sorted(search.keywords)))} {sites}"
google_query = f"{' OR '.join(set(sorted(search.keywords)))}"
# fmt: on
sites = f"{' OR '.join(set('site:' + url for url in search.sites + defaults))}"

return google_query
return f"{google_query} {sites}"

def __init__(self):
API_KEYS.ensure("GOOGLE_API_KEY", "google_search")
Expand All @@ -163,19 +172,16 @@ def google_search(self, search: SearchResult) -> str:
:return: A refined string containing the search results.
"""
events.reply.emit(reply=AIReply.info(msg.searching()))
search.sites = search.sites or ["google.com", "bing.com", "duckduckgo.com", "ask.com"]
terms: str = self._build_google_query(search).strip()
question: str = re.sub(r"(\w+:)*|((\w+\.\w+)*)", "", terms, flags=re.DOTALL | re.MULTILINE)
try:
log.info("Searching Google for '%s'", terms)
events.reply.emit(reply=AIReply.debug(msg.final_query(terms)))
results: list[str] = str(self._tool.run(terms, verbose=configs.is_debug)).split(" ")
llm_prompt = ChatPromptTemplate.from_messages(
[
("system", "Use the following context to answer the question at the end:\\n\\n{context}"),
("human", "{question}"),
]
refine_prompt = PromptTemplate.from_template(self.refine_template).format(
idiom=shared.idiom, sources=search.sites, location=geo_location.location, datetime=geo_location.datetime
)
llm_prompt = ChatPromptTemplate.from_messages([("system", refine_prompt), ("human", "{question}")])
docs: List[Document] = [Document(d) for d in results]
chain = create_stuff_documents_chain(
lc_llm.create_chat_model(temperature=Temperature.COLDEST.temp), llm_prompt
Expand All @@ -184,66 +190,7 @@ def google_search(self, search: SearchResult) -> str:
except (HttpError, APIError) as err:
return msg.fail_to_search(str(err))

return self.refine_search(terms, output, search)

def refine_search(self, terms: str, response: str, search: SearchResult) -> str:
    """Refine the text retrieved by the search engine.
    :param terms: The search terms used in the search.
    :param response: The internet search response.
    :param search: The search result object.
    :return: A refined version of the search result text, tailored to better answer the user's question.
    """
    # Build the refinement prompt, injecting the user's locale and geo context.
    template = PromptTemplate.from_template(self.refine_template)
    refine_prompt = template.format(
        idiom=shared.idiom,
        sources=search.sites,
        location=geo_location.location,
        datetime=geo_location.datetime,
        result=response,
        question=search.question,
    )
    log.info("STT::[QUESTION] '%s'", response)
    # A creative temperature is used here on purpose: the goal is rewriting, not retrieval.
    llm = lc_llm.create_chat_model(temperature=Temperature.CREATIVE_WRITING.temp)
    llm_response = llm.invoke(refine_prompt)
    if llm_response:
        output = llm_response.content
        if output:
            return self.wrap_response(terms, output, search)
    # The model produced nothing usable: report a graceful failure message.
    return msg.no_good_result()

def scrap_sites(self, search: SearchResult) -> str:
    """Scrape a web page and summarize its contents.
    :param search: The AI search parameters encapsulated in a SearchResult object.
    :return: A string containing the summarized contents of the scraped web page.
    """
    events.reply.emit(reply=AIReply.info(msg.scrapping()))
    if len(search.sites) > 0:
        log.info("Scrapping sites: '%s'", str(", ".join(search.sites)))
        # Fetch the pages, keeping only the content-bearing tags to cut noise.
        loader = WebBaseLoader(
            web_paths=search.sites,
            bs_kwargs=dict(parse_only=bs4.SoupStrainer(["article", "span", "div", "h1", "h2", "h3"])),
        )
        if (page_content := loader.load()) and len(page_content) > 0:
            # Split and embed the pages into an ephemeral vector store for RAG retrieval.
            splits: List[Document] = summarizer.text_splitter.split_documents(page_content)
            v_store = Chroma.from_documents(splits, lc_llm.create_embeddings())
            retriever = v_store.as_retriever()
            scrap_prompt = PromptTemplate(
                input_variables=["context", "question"], template=prompt.read_prompt("qstring")
            )

            def _format_docs(docs) -> str:
                # Collapse the retrieved documents into a single context string.
                return "\n\n".join(doc.page_content for doc in docs)

            # Classic RAG pipeline: retrieve -> prompt -> model -> plain string.
            rag_chain = (
                {"context": retriever | _format_docs, "question": RunnablePassthrough()}
                | scrap_prompt
                | lc_llm.create_model()
                | StrOutputParser()
            )

            output: Output = rag_chain.invoke(search.question)
            v_store.delete_collection()  # cleanup: the store is per-call, not persistent
            log.info("Scrapping sites returned: '%s'", str(output))
            return self.refine_search(search.question, str(output), search)
        return msg.no_output("search")
return self.wrap_response(terms, output, search)


assert (internet := InternetService().INSTANCE) is not None
9 changes: 6 additions & 3 deletions src/main/askai/resources/prompts/refine-search.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ Refine the existing response by adding more relevant details and ensuring the ex
3. **Response Format:**
- Enhance the response using Markdown to format single-line code blocks for brief code snippets and multi-line code blocks for extensive code sections. Emphasize key elements or important information in **bold** and names in *italic*. When the response is already a markdown formatted text, just ensure everything is neat.

4. **Leave it Untouched:**
4. **References:**
- Ensure that any references, links, or external sources found in the context are included. Do not fabricate any link or reference.

5. **Leave it Untouched:**
- If no improvements are possible, return the result as is without any extraneous explanation or comments.


Expand All @@ -34,10 +37,10 @@ Refine the existing response by adding more relevant details and ensuring the ex

Internet Search Result:

{result}
{{context}}


User Question: "{question}"
User Question: "{{question}}"


Begin refining the response!
12 changes: 5 additions & 7 deletions src/main/askai/resources/prompts/search-builder.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,15 @@ Your task is to respond to a user query following the steps below. You MUST foll

3. **Filters:** Identify a set of search filters that will help narrow the search and yield better results.

4. **Source Selection:** Retrieve credible sources relevant to the question. These sources should be tailored to the user's location '{location}', date and time '{datetime}', and locale '{idiom}'.
4. **Source Selection:** Retrieve credible sources relevant to the question. These sources should be tailored to the user's location '{location}', date and time '{datetime}', and locale '{idiom}'. Include only the domain from each URL, excluding the protocol and the 'www' prefix. Example: http://www.google.com -> google.com, https://linkedin.com -> linkedin.com.

5. **Personal Inquiries:** For inquiries related to non notorious individuals, **EXTRACT THE EXACT NAME** of the referenced person **WITHOUT MODIFYING** IT and add the filter: intext:"<person name>" to your list. Default to the following websites if none are mentioned: "github.com", "linkedin.com", "facebook.com", "instagram.com", "tiktok.com", "x.com".
5. **People Inquiries:** For inquiries related to non-notorious individuals, **EXTRACT THE EXACT NAME** of the referenced person **WITHOUT MODIFYING** IT and add the filter: intext:"<person name>" to your list.

6. **Technical Inquiries:** If the query relates to programming languages, operating systems, or IT, default to using the following websites: "stackoverflow.com", "github.com".
6. **Weather Inquiries:** For weather-related inquiries, add the filter: 'weather:"<location>"' to your list.

7. **Weather Inquiries:** For weather-related inquiries, add the filter: 'weather:"<location>"' to your list. Default to websites such as 'weather.com', 'accuweather.com', 'weather.gov'.
7. **Map Inquiries:** For map-related inquiries, add the filter: 'map:"<location>"' to your list.

8. **Map Inquiries:** For map-related inquiries, add the filter: 'map:"<location>"' to your list.

9. **General Search:** For broad inquiries or searches where the nature of the query cannot be determined, avoid using restrictive filters. Instead, rely on general search engines such as "google.com", "bing.com", "duckduckgo.com", and "ask.com."
8. **General Search:** For broad inquiries or searches where the nature of the query cannot be determined, avoid using restrictive filters.

The response should follow this format:

Expand Down
3 changes: 1 addition & 2 deletions src/main/askai/resources/rag/task-splitter.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ Hello, who are you?;The user is engaging in a casual conversation. The response
find . -mxdepth 1 -type f -nme *.png;The user has provided a direct terminal command that contains syntax errors. I will correct these errors and execute the command afterward.;N/A;[{{1. Execute on terminal: 'find . -maxdepth 1 -type f -name *.png'}}]
What is the next Flamengo match?;The user is seeking the schedule for the upcoming Flamengo match. To obtain this information, I will conduct a Google search using today's date, July 9, 2024.;N/A;[{{1. Search google for: 'Upcoming Flamengo match schedule Brazil July 9th 2024'}}]
What is the weather in San Francisco?;The user seeks to know the current weather conditions in San Francisco, U.S. To obtain this information, I will browse Google, utilizing today's date, 2024-07-09, along with the relevant locale.;N/A;[{{1. Search google for: 'Current weather conditions in San Francisco U.S July 9th 2024.'}}]
Who is Fulano de Tal?;I'm unable to provide real-time information about individuals, so I will search Google for Fulano de Tal.;N/A;[{{1. Search google for: 'Fulano de Tal'}}]
List my music;The user requests a list of all his music files. The music directory is located at ${{HOME}}/Music, as per his operating system (Darwin).;N/A;[{{1. List all music files [*.mp3, *.wav, *.m4a, *.aac, *.ogg]}}]
Summarize my markdown files at my HomeSetup docs folder.;The user requests a summary of their markdown files located in the 'HomeSetup' folder. I will utilize the Summarizer tool for this purpose.;N/A;[{{1. Summarize all *.md files from ${{HOME}}/HomeSetup/docs }}]
Open the first reminder you find at my downloads and tell me what I must do first.;The user requests that I determine the initial task or action needed based on the first reminder file found in their downloads folder.;N/A;[{{1. List the user downloads folder}},{{2. Identify the first reminder file}},{{3. Open the first reminder file if found}},{{4. Identify the first task or action to be taken according to this reminder}}]
Expand All @@ -21,7 +20,7 @@ List my downloads using stt.;The user requests that I list the contents of their
Create a small Python program to allow me to calculate the speed given the time and distance, save it as 'dist.py'.;The user wants me to create a Python program that calculates speed based on time and distance. I will use the generate_content tool for this purpose, keeping in mind that this tool automatically saves the generated content.;N/A;[{{1. Generate a program to calculate the speed given the time and distance and save it as 'dist.py'}}]
rm -f /tmp/myfile.txt /tmp/trash.bak;The user provided a direct terminal command. There are no syntax errors, so I will simply forward it.;N/A;[{{1. Execute on terminal: 'rm -f /tmp/myfile.txt /tmp/trash.bak'}}]
Describe me;The user requests to describe him, and I can utilize the webcam_capturer for this purpose.;N/A;{{1. Use the webcam_capturer to describe the person in front of it}}
Who is Hugo Saporetti Junior;I'm unable to provide real-time information about individuals, so I will search Google for Hugo Saporetti junior.;N/A;[{{1. Search google for: "Hugo Saporetti Junior" description + background + achievements}}]
Who is Hugo Saporetti Junior;I'm unable to provide real-time information about individuals, so I will search Google for Hugo Saporetti junior.;N/A;[{{1. Search google for: "Hugo Saporetti Junior" }}]
Open Hugo Saporetti junior's linkedin page;The user requests that I open the LinkedIn page of 'Hugo Saporetti Junior', therefore, I need to search on linkedin for it and open it if found.;N/A;[{{1. Search google for: linkedin.com/search/results/people/?keywords=Hugo%20Saporetti%20Junior }}, {{2. Open the URL if found}}]
Open the official wikipedia website;The user wants me to open the official wikipedia website, which is a widely known URL.;Direct: Open the URL https://www.wikipedia.org;N/A
Open yorevs github page;The user wants me to open the official github website of a specific user.;Direct: Open the URL https://github.com/yorevs;N/A
Expand Down

0 comments on commit 42d4394

Please sign in to comment.