Skip to content

Commit

Permalink
Fix InternetSearch format and query. Removed one agent run
Browse files Browse the repository at this point in the history
  • Loading branch information
yorevs committed Sep 30, 2024
1 parent 45de799 commit 42d4394
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 111 deletions.
145 changes: 46 additions & 99 deletions src/main/askai/core/component/internet_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,34 +18,28 @@
from collections import defaultdict
from typing import List

import bs4
from googleapiclient.errors import HttpError
from hspylib.core.metaclass.singleton import Singleton
from hspylib.core.zoned_datetime import now
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import APIError

from askai.__classpath__ import API_KEYS
from askai.core.askai_configs import configs
from askai.core.askai_events import events
from askai.core.askai_messages import msg
from askai.core.askai_prompt import prompt
from askai.core.component.geo_location import geo_location
from askai.core.component.summarizer import summarizer
from askai.core.engine.openai.temperature import Temperature
from askai.core.model.ai_reply import AIReply
from askai.core.model.search_result import SearchResult
from askai.core.support.langchain_support import lc_llm
from askai.core.support.shared_instances import shared
from googleapiclient.errors import HttpError
from hspylib.core.metaclass.singleton import Singleton
from hspylib.core.zoned_datetime import now
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables.utils import Output
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import APIError


class InternetService(metaclass=Singleton):
Expand Down Expand Up @@ -84,35 +78,40 @@ class InternetService(metaclass=Singleton):
"docker.com": "",
"dropbox.com": "",
"google.com": "",
"search.google.com": "",
"paypal.com": "",
"wikipedia.org": "荒",
"reddit.com": "",
"tiktok.com": "懲",
"ubuntu.com": "",
"fedora.com": "",
})
# fmt: on

@classmethod
def _url_to_icon(cls, url: str) -> str:
    """Translate a site domain into its icon glyph, when one is known.
    :param url: The site domain to translate (e.g. 'github.com').
    :return: The icon mapped in SITE_ICONS when the entry exists and is non-empty;
             otherwise the original URL unchanged.
    """
    # Use .get() instead of [] so the lookup does not insert a new key into the
    # shared SITE_ICONS defaultdict as a side effect; `or` falls back to the URL
    # for missing entries and for entries deliberately mapped to "".
    return cls.SITE_ICONS.get(url) or url

@classmethod
def wrap_response(cls, terms: str, output: str, search: SearchResult) -> str:
"""Format and wrap the search response based on the search terms, output, and method used.
:param terms: The search terms used in the query.
:param output: The raw output or results from the search.
:param result: The search result.
:param search: The search result.
:return: A formatted string that encapsulates the search response.
"""
terms: str = re.sub(r"\s{2,}", " ", terms)
sites: set[str] = set(re.findall(r"site:(.+?\..+?)\s+", terms) + result.sites)
sources: str = " ".join(
filter(len, set(sorted([f"{s.replace(s, cls.SITE_ICONS[s]):<2}".strip() or s for s in sites], key=len)))
)
re_site: str = r"site:([a-zA-Z0-9._%+-]+(?:\.[a-zA-Z]{2,})+)"
sites: list = re.findall(re_site, terms)
terms: str = re.sub(r"\s{2,}", " ", re.sub(r"OR", "", re.sub(re_site, "", terms)))
sources: str = " ".join(sorted([f"{cls._url_to_icon(s):<2}".strip() or s for s in sites], key=len))
# fmt: off
return (
f"Your {result.engine.title()} search has returned the following results:"
f"Your {search.engine.title()} search has returned the following results:"
f"\n\n{output}\n\n---\n\n"
f"`{cls.CATEGORY_ICONS[result.category]:<2}{result.category}` **Sources:** {sources} "
f"`{cls.CATEGORY_ICONS[search.category]:<2}{search.category}` "
f"**Sources:** *{sources}* "
f"**Access:** {geo_location.location} - *{now('%B %d, %Y')}*\n\n"
f">  Terms: {terms}")
f">  Terms: {terms} \n")
# fmt: on

@staticmethod
Expand All @@ -123,25 +122,35 @@ def _build_google_query(search: SearchResult) -> str:
"""
# The order of conditions is important here, as the execution may stop early if a condition is met.
google_query = ""
defaults: list[str] = []
# fmt: off
match search.category.casefold():
case "people":
defaults = ["linkedin.com", "github.com", "instagram.com", "facebook.com"]
if any((f.find("intext:") >= 0 for f in search.filters)):
google_query = (
"description AND background AND work AND achievements "
"background OR current work OR achievements "
f'{next((f for f in search.filters if f.startswith("intext:")), None)}'
)
case "weather":
defaults = ["weather.com", "accuweather.com", "weather.gov"]
if any((f.find("weather:") >= 0 for f in search.filters)):
google_query = (
f'{now("%B %d %Y")} {next((f for f in search.filters if f.startswith("weather:")), None)}'
f"{now('%B %d %Y')}"
f'{next((f for f in search.filters if f.startswith("weather:")), None)}'
)
case "programming":
defaults = ["stackoverflow.com", "github.com"]
case "general":
defaults = ["google.com", "bing.com", "duckduckgo.com", "ask.com"]
case _:
if search.keywords:
# Gather the sites to be used in the search.
sites = f"{' OR '.join(set('site:' + url for url in search.sites))}"
google_query = f"{' '.join(set(sorted(search.keywords)))} {sites}"
google_query = f"{' OR '.join(set(sorted(search.keywords)))}"
# fmt: on
sites = f"{' OR '.join(set('site:' + url for url in search.sites + defaults))}"

return google_query
return f"{google_query} {sites}"

def __init__(self):
API_KEYS.ensure("GOOGLE_API_KEY", "google_search")
Expand All @@ -163,19 +172,16 @@ def google_search(self, search: SearchResult) -> str:
:return: A refined string containing the search results.
"""
events.reply.emit(reply=AIReply.info(msg.searching()))
search.sites = search.sites or ["google.com", "bing.com", "duckduckgo.com", "ask.com"]
terms: str = self._build_google_query(search).strip()
question: str = re.sub(r"(\w+:)*|((\w+\.\w+)*)", "", terms, flags=re.DOTALL | re.MULTILINE)
try:
log.info("Searching Google for '%s'", terms)
events.reply.emit(reply=AIReply.debug(msg.final_query(terms)))
results: list[str] = str(self._tool.run(terms, verbose=configs.is_debug)).split(" ")
llm_prompt = ChatPromptTemplate.from_messages(
[
("system", "Use the following context to answer the question at the end:\\n\\n{context}"),
("human", "{question}"),
]
refine_prompt = PromptTemplate.from_template(self.refine_template).format(
idiom=shared.idiom, sources=search.sites, location=geo_location.location, datetime=geo_location.datetime
)
llm_prompt = ChatPromptTemplate.from_messages([("system", refine_prompt), ("human", "{question}")])
docs: List[Document] = [Document(d) for d in results]
chain = create_stuff_documents_chain(
lc_llm.create_chat_model(temperature=Temperature.COLDEST.temp), llm_prompt
Expand All @@ -184,66 +190,7 @@ def google_search(self, search: SearchResult) -> str:
except (HttpError, APIError) as err:
return msg.fail_to_search(str(err))

return self.refine_search(terms, output, search)

def refine_search(self, terms: str, response: str, search: SearchResult) -> str:
    """Refine the text retrieved by the search engine.
    :param terms: The search terms used in the search.
    :param response: The internet search response.
    :param search: The search result object.
    :return: A refined version of the search result text, tailored to better answer the user's question.
    """
    # Build the refinement prompt, injecting the user's locale and geo context.
    template = PromptTemplate.from_template(self.refine_template)
    refine_prompt = template.format(
        idiom=shared.idiom,
        sources=search.sites,
        location=geo_location.location,
        datetime=geo_location.datetime,
        result=response,
        question=search.question,
    )
    log.info("STT::[QUESTION] '%s'", response)
    # A creative temperature is used here on purpose: the goal is rewriting, not retrieval.
    llm = lc_llm.create_chat_model(temperature=Temperature.CREATIVE_WRITING.temp)
    llm_response = llm.invoke(refine_prompt)
    if llm_response:
        output = llm_response.content
        if output:
            return self.wrap_response(terms, output, search)
    # The model produced nothing usable: report a graceful failure message.
    return msg.no_good_result()

def scrap_sites(self, search: SearchResult) -> str:
    """Scrape a web page and summarize its contents.
    :param search: The AI search parameters encapsulated in a SearchResult object.
    :return: A string containing the summarized contents of the scraped web page.
    """
    events.reply.emit(reply=AIReply.info(msg.scrapping()))
    if len(search.sites) > 0:
        log.info("Scrapping sites: '%s'", str(", ".join(search.sites)))
        # Fetch the pages, keeping only the content-bearing tags to cut noise.
        loader = WebBaseLoader(
            web_paths=search.sites,
            bs_kwargs=dict(parse_only=bs4.SoupStrainer(["article", "span", "div", "h1", "h2", "h3"])),
        )
        if (page_content := loader.load()) and len(page_content) > 0:
            # Split and embed the pages into an ephemeral vector store for RAG retrieval.
            splits: List[Document] = summarizer.text_splitter.split_documents(page_content)
            v_store = Chroma.from_documents(splits, lc_llm.create_embeddings())
            retriever = v_store.as_retriever()
            scrap_prompt = PromptTemplate(
                input_variables=["context", "question"], template=prompt.read_prompt("qstring")
            )

            def _format_docs(docs) -> str:
                # Collapse the retrieved documents into a single context string.
                return "\n\n".join(doc.page_content for doc in docs)

            # Classic RAG pipeline: retrieve -> prompt -> model -> plain string.
            rag_chain = (
                {"context": retriever | _format_docs, "question": RunnablePassthrough()}
                | scrap_prompt
                | lc_llm.create_model()
                | StrOutputParser()
            )

            output: Output = rag_chain.invoke(search.question)
            v_store.delete_collection()  # cleanup: the store is per-call, not persistent
            log.info("Scrapping sites returned: '%s'", str(output))
            return self.refine_search(search.question, str(output), search)
        return msg.no_output("search")
return self.wrap_response(terms, output, search)


assert (internet := InternetService().INSTANCE) is not None
9 changes: 6 additions & 3 deletions src/main/askai/resources/prompts/refine-search.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ Refine the existing response by adding more relevant details and ensuring the ex
3. **Response Format:**
- Enhance the response using Markdown to format single-line code blocks for brief code snippets and multi-line code blocks for extensive code sections. Emphasize key elements or important information in **bold** and names in *italic*. When the response is already a markdown formatted text, just ensure everything is neat.

4. **Leave it Untouched:**
4. **References:**
- Ensure that any references, links, or external sources found in the context are included. Do not fabricate any link or reference.

5. **Leave it Untouched:**
- If no improvements are possible, return the result as is without any extraneous explanation or comments.


Expand All @@ -34,10 +37,10 @@ Refine the existing response by adding more relevant details and ensuring the ex

Internet Search Result:

{result}
{{context}}


User Question: "{question}"
User Question: "{{question}}"


Begin refining the response!
12 changes: 5 additions & 7 deletions src/main/askai/resources/prompts/search-builder.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,15 @@ Your task is to respond to a user query following the steps below. You MUST foll

3. **Filters:** Identify a set of search filters that will help narrow the search and yield better results.

4. **Source Selection:** Retrieve credible sources relevant to the question. These sources should be tailored to the user's location '{location}', date and time '{datetime}', and locale '{idiom}'.
4. **Source Selection:** Retrieve credible sources relevant to the question. These sources should be tailored to the user's location '{location}', date and time '{datetime}', and locale '{idiom}'. Include only the domain from each URL, excluding the protocol and the 'www' prefix. Example: http://www.google.com -> google.com, https://linkedin.com -> linkedin.com.

5. **Personal Inquiries:** For inquiries related to non notorious individuals, **EXTRACT THE EXACT NAME** of the referenced person **WITHOUT MODIFYING** IT and add the filter: intext:"<person name>" to your list. Default to the following websites if none are mentioned: "github.com", "linkedin.com", "facebook.com", "instagram.com", "tiktok.com", "x.com".
5. **People Inquiries:** For inquiries related to non-notorious individuals, **EXTRACT THE EXACT NAME** of the referenced person **WITHOUT MODIFYING** IT and add the filter: intext:"<person name>" to your list.

6. **Technical Inquiries:** If the query relates to programming languages, operating systems, or IT, default to using the following websites: "stackoverflow.com", "github.com".
6. **Weather Inquiries:** For weather-related inquiries, add the filter: 'weather:"<location>"' to your list.

7. **Weather Inquiries:** For weather-related inquiries, add the filter: 'weather:"<location>"' to your list. Default to websites such as 'weather.com', 'accuweather.com', 'weather.gov'.
7. **Map Inquiries:** For map-related inquiries, add the filter: 'map:"<location>"' to your list.

8. **Map Inquiries:** For map-related inquiries, add the filter: 'map:"<location>"' to your list.

9. **General Search:** For broad inquiries or searches where the nature of the query cannot be determined, avoid using restrictive filters. Instead, rely on general search engines such as "google.com", "bing.com", "duckduckgo.com", and "ask.com."
8. **General Search:** For broad inquiries or searches where the nature of the query cannot be determined, avoid using restrictive filters.

The response should follow this format:

Expand Down
3 changes: 1 addition & 2 deletions src/main/askai/resources/rag/task-splitter.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ Hello, who are you?;The user is engaging in a casual conversation. The response
find . -mxdepth 1 -type f -nme *.png;The user has provided a direct terminal command that contains syntax errors. I will correct these errors and execute the command afterward.;N/A;[{{1. Execute on terminal: 'find . -maxdepth 1 -type f -name *.png'}}]
What is the next Flamengo match?;The user is seeking the schedule for the upcoming Flamengo match. To obtain this information, I will conduct a Google search using today's date, July 9, 2024.;N/A;[{{1. Search google for: 'Upcoming Flamengo match schedule Brazil July 9th 2024'}}]
What is the weather in San Francisco?;The user seeks to know the current weather conditions in San Francisco, U.S. To obtain this information, I will browse Google, utilizing today's date, 2024-07-09, along with the relevant locale.;N/A;[{{1. Search google for: 'Current weather conditions in San Francisco U.S July 9th 2024.'}}]
Who is Fulano de Tal?;I'm unable to provide real-time information about individuals, so I will search Google for Fulano de Tal.;N/A;[{{1. Search google for: 'Fulano de Tal'}}]
List my music;The user requests a list of all his music files. The music directory is located at ${{HOME}}/Music, as per his operating system (Darwin).;N/A;[{{1. List all music files [*.mp3, *.wav, *.m4a, *.aac, *.ogg]}}]
Summarize my markdown files at my HomeSetup docs folder.;The user requests a summary of their markdown files located in the 'HomeSetup' folder. I will utilize the Summarizer tool for this purpose.;N/A;[{{1. Summarize all *.md files from ${{HOME}}/HomeSetup/docs }}]
Open the first reminder you find at my downloads and tell me what I must do first.;The user requests that I determine the initial task or action needed based on the first reminder file found in their downloads folder.;N/A;[{{1. List the user downloads folder}},{{2. Identify the first reminder file}},{{3. Open the first reminder file if found}},{{4. Identify the first task or action to be taken according to this reminder}}]
Expand All @@ -21,7 +20,7 @@ List my downloads using stt.;The user requests that I list the contents of their
Create a small Python program to allow me to calculate the speed given the time and distance, save it as 'dist.py'.;The user wants me to create a Python program that calculates speed based on time and distance. I will use the generate_content tool for this purpose, keeping in mind that this tool automatically saves the generated content.;N/A;[{{1. Generate a program to calculate the speed given the time and distance and save it as 'dist.py'}}]
rm -f /tmp/myfile.txt /tmp/trash.bak;The user provided a direct terminal command. There are no syntax errors, so I will simply forward it.;N/A;[{{1. Execute on terminal: 'rm -f /tmp/myfile.txt /tmp/trash.bak'}}]
Describe me;The user requests to describe him, and I can utilize the webcam_capturer for this purpose.;N/A;{{1. Use the webcam_capturer to describe the person in front of it}}
Who is Hugo Saporetti Junior;I'm unable to provide real-time information about individuals, so I will search Google for Hugo Saporetti junior.;N/A;[{{1. Search google for: "Hugo Saporetti Junior" description + background + achievements}}]
Who is Hugo Saporetti Junior;I'm unable to provide real-time information about individuals, so I will search Google for Hugo Saporetti junior.;N/A;[{{1. Search google for: "Hugo Saporetti Junior" }}]
Open Hugo Saporetti junior's linkedin page;The user requests that I open the LinkedIn page of 'Hugo Saporetti Junior', therefore, I need to search on linkedin for it and open it if found.;N/A;[{{1. Search google for: linkedin.com/search/results/people/?keywords=Hugo%20Saporetti%20Junior }}, {{2. Open the URL if found}}]
Open the official wikipedia website;The user wants me to open the official wikipedia website, which is a widely known URL.;Direct: Open the URL https://www.wikipedia.org;N/A
Open yorevs github page;The user wants me to open the official github website of a specific user.;Direct: Open the URL https://github.com/yorevs;N/A
Expand Down

0 comments on commit 42d4394

Please sign in to comment.