diff --git a/dependencies.hspd b/dependencies.hspd index 5f36341e..a4f8cf67 100644 --- a/dependencies.hspd +++ b/dependencies.hspd @@ -25,6 +25,9 @@ package: langchain-community package: unstructured package: python-magic-bin package: chromadb +package: tiktoken +package: aiohttp +package: html2text package: PyAudio package: SpeechRecognition package: openai-whisper diff --git a/src/main/askai/core/askai.py b/src/main/askai/core/askai.py index 9c486b8d..4ec6d51c 100644 --- a/src/main/askai/core/askai.py +++ b/src/main/askai/core/askai.py @@ -43,7 +43,6 @@ from askai.core.processor.ai_processor import AIProcessor from askai.core.processor.generic_processor import GenericProcessor from askai.core.processor.internet_processor import InternetProcessor -from askai.core.processor.output_processor import OutputProcessor from askai.core.processor.processor_proxy import proxy from askai.core.processor.summary_processor import SummaryProcessor from askai.core.support.object_mapper import object_mapper @@ -244,14 +243,14 @@ def _process_response(self, proxy_response: QueryResponse) -> bool: # Intrinsic features if not proxy_response.intelligible: self.reply_error(msg.intelligible(proxy_response.question)) - return False + return True elif proxy_response.terminating: log.info("User wants to terminate the conversation.") return False elif proxy_response.require_internet: log.info("Internet is required to fulfill the request.") processor = AIProcessor.get_by_name(InternetProcessor.__name__) - processor.bind(AIProcessor.get_by_query_type(proxy_response.query_type)) + processor.bind(AIProcessor.get_by_name(GenericProcessor.__name__)) elif proxy_response.require_summarization: log.info("Summarization is required to fulfill the request.") processor = AIProcessor.get_by_name(SummaryProcessor.__name__) diff --git a/src/main/askai/core/askai_messages.py b/src/main/askai/core/askai_messages.py index 6073d189..004ba8a5 100644 --- a/src/main/askai/core/askai_messages.py +++ b/src/main/askai/core/askai_messages.py @@ -75,6 +75,10 @@ def summarizing(self, path: str) -> str: def cmd_no_output(self) -> str: return self.translate(f"The command didn't return an output !") + @lru_cache + def search_empty(self) -> str: + return self.translate(f"The google research didn't return an output !") + @lru_cache def access_grant(self) -> str: return self.translate(f"'AskAI' requires access to your files, folders and apps. Continue (yes/[no])?") @@ -103,7 +107,7 @@ def cmd_failed(self, cmd_line: str) -> str: @lru_cache def intelligible(self, question: str) -> str: - return self.translate(f"Your question '{question}' is not clear, please rephrase !") + return self.translate(f"Your question '{question}' is not clear, please reformulate !") @lru_cache def llm_error(self, error: str) -> str: diff --git a/src/main/askai/core/component/internet_service.py b/src/main/askai/core/component/internet_service.py index 4014f0be..f8ccf9b0 100644 --- a/src/main/askai/core/component/internet_service.py +++ b/src/main/askai/core/component/internet_service.py @@ -13,17 +13,21 @@ Copyright·(c)·2024,·HSPyLib """ import logging as log -import os -from functools import lru_cache -from typing import Optional +from typing import Optional, List from hspylib.core.metaclass.singleton import Singleton +from langchain.chains import load_summarize_chain +from langchain.chains.combine_documents import create_stuff_documents_chain +from langchain_community.document_loaders.async_html import AsyncHtmlLoader from langchain_community.utilities import GoogleSearchAPIWrapper +from langchain_core.documents import Document +from langchain_core.prompts import ChatPromptTemplate from langchain_core.tools import Tool +from langchain_text_splitters import CharacterTextSplitter from askai.core.askai_events import AskAiEvents from askai.core.askai_messages import msg -from askai.core.support.langchain_support import lc_llm +from askai.core.support.langchain_support import lc_llm, load_document class InternetService(metaclass=Singleton): @@ -33,6 +37,15 @@ class InternetService(metaclass=Singleton): ASKAI_INTERNET_DATA_KEY = "askai-internet-data" + @staticmethod + def scrap_sites(*sites: str) -> Optional[str]: + """TODO""" + log.info("Scrapping sites: '%s'", str(sites)) + docs: List[Document] = load_document(AsyncHtmlLoader, *sites) + chain = load_summarize_chain(lc_llm.create_chat_model(), chain_type="stuff") + search_results = chain.invoke(docs) + return search_results['output_text'] + def __init__(self): self._google = GoogleSearchAPIWrapper() self._tool = Tool( @@ -40,30 +53,25 @@ def __init__(self): description="Search Google for recent results.", func=self._google.run) - @lru_cache def search_google(self, query: str, *sites: str) -> Optional[str]: """Search the web using google search API. :param query: The google search query string. :param sites: The sites you want google to search for. """ - search_results: str = '' AskAiEvents.ASKAI_BUS.events.reply.emit(message=msg.searching()) - log.info("Searching GOOGLE for '%s' url: '%s'", query, str(sites)) - if sites: + if len(sites) > 0: + log.info("Searching GOOGLE for '%s' url: '%s'", query, str(sites)) + search_results: str = '' for url in sites: search_results += str(self._tool.run(f"{query} site: {url}")) - else: - search_results += str(self._tool.run(f"{query}")) - log.debug(f"Internet search output: %s", search_results) + text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100) + docs: List[Document] = [Document(page_content=x) for x in text_splitter.split_text(search_results)] + prompt = ChatPromptTemplate.from_messages([("system", "{query}\n\n{context}")]) + chain = create_stuff_documents_chain(lc_llm.create_chat_model(), prompt) + search_results = chain.invoke({"query": query, "context": docs}) + return search_results - return search_results + return None assert (internet := InternetService().INSTANCE) is not None - - -if __name__ == '__main__': - q = 'What is the whether like in Belo Horizonte now' - embeddings = lc_llm.create_embeddings() - c = internet.search_google(q) - eq = embeddings.embed_query(q) diff --git a/src/main/askai/core/engine/ai_engine.py b/src/main/askai/core/engine/ai_engine.py index 5b447e47..98962ec2 100644 --- a/src/main/askai/core/engine/ai_engine.py +++ b/src/main/askai/core/engine/ai_engine.py @@ -12,6 +12,8 @@ Copyright·(c)·2024,·HSPyLib """ +from langchain_core.language_models import BaseLLM, BaseChatModel + from askai.core.model.ai_model import AIModel from askai.core.model.ai_reply import AIReply from typing import Any, List, Optional, Protocol @@ -20,8 +22,18 @@ class AIEngine(Protocol): """Provide an interface for AI engines.""" - def lc_model(self, temperature: float, top_p: float) -> Any: - """Create a LangChain AI model instance.""" + def lc_model(self, temperature: float, top_p: float) -> BaseLLM: + """Create a LangChain AI model instance. + :param temperature: The model engine temperature. + :param top_p: The model engine top_p. + """ + ... + + def lc_chat_model(self, temperature: float = 0.0) -> BaseChatModel: + """Create a LangChain OpenAI llm chat model instance. + :param temperature: The model engine temperature. + """ + ... def lc_embeddings(self) -> Any: """Create a LangChain AI embeddings instance.""" @@ -50,8 +62,8 @@ def models(self) -> List[AIModel]: def ask(self, chat_context: List[dict], temperature: float = 0.8, top_p: float = 0.0) -> AIReply: """Ask AI assistance for the given question and expect a response. :param chat_context: The chat history or context. - :param temperature: TODO - :param top_p: TODO + :param temperature: The model engine temperature. + :param top_p: The model engine top_p. """ ... diff --git a/src/main/askai/core/engine/openai/openai_engine.py b/src/main/askai/core/engine/openai/openai_engine.py index a0753d22..3b3f0b4f 100644 --- a/src/main/askai/core/engine/openai/openai_engine.py +++ b/src/main/askai/core/engine/openai/openai_engine.py @@ -16,11 +16,13 @@ import logging as log import os from threading import Thread -from typing import Any, List, Optional +from typing import List, Optional import langchain_openai import pause from hspylib.core.preconditions import check_not_none +from langchain_core.embeddings import Embeddings +from langchain_core.language_models import BaseLLM, BaseChatModel from openai import APIError, OpenAI from askai.core.component.audio_player import AudioPlayer @@ -52,12 +54,17 @@ def url(self) -> str: def client(self) -> OpenAI: return self._client - def lc_model(self, temperature: float = 0.8, top_p: float = 0.0) -> Any: + def lc_model(self, temperature: float = 0.0, top_p: float = 0.0) -> BaseLLM: """Create a LangChain OpenAI llm model instance.""" return langchain_openai.OpenAI( openai_api_key=self._api_key, temperature=temperature, top_p=top_p) - def lc_embeddings(self) -> Any: + def lc_chat_model(self, temperature: float = 0.0) -> BaseChatModel: + """Create a LangChain OpenAI llm chat model instance.""" + return langchain_openai.ChatOpenAI( + openai_api_key=self._api_key, temperature=temperature) + + def lc_embeddings(self) -> Embeddings: """Create a LangChain AI embeddings instance.""" return langchain_openai.OpenAIEmbeddings() diff --git a/src/main/askai/core/model/chat_context.py b/src/main/askai/core/model/chat_context.py index 205df47a..89ae550e 100644 --- a/src/main/askai/core/model/chat_context.py +++ b/src/main/askai/core/model/chat_context.py @@ -30,7 +30,7 @@ class ChatContext: def __init__(self, token_limit: int): self._context = defaultdict(list) - self._token_limit: int = token_limit #* 1024 # The limit is given in KB + self._token_limit: int = token_limit * 1024 # The limit is given in KB def __str__(self): return os.linesep.join(f"'{k}': '{v}'" for k, v in self._context.items()) @@ -79,8 +79,9 @@ def join(self, *keys: str) -> ContextRaw: context.extend(self.get(key)) return context - def clear(self, key: str) -> int: + def clear(self, *keys: str) -> int: """Clear the all the chat context specified by key.""" - if self._context[key]: - del self._context[key] + for key in keys: + if self._context[key]: + del self._context[key] return len(self._context) diff --git a/src/main/askai/core/model/summary_result.py b/src/main/askai/core/model/summary_result.py index edcb4709..8a618b71 100644 --- a/src/main/askai/core/model/summary_result.py +++ b/src/main/askai/core/model/summary_result.py @@ -26,4 +26,4 @@ class SummaryResult: answer: str = None def __str__(self): - return f"Summarization: {json.dumps(self.__dict__, default=lambda obj: obj.__dict__)}" + return f"Summarization results: {json.dumps(self.__dict__, default=lambda obj: obj.__dict__)}" diff --git a/src/main/askai/core/processor/generic_processor.py b/src/main/askai/core/processor/generic_processor.py index a4808ab4..c8901415 100644 --- a/src/main/askai/core/processor/generic_processor.py +++ b/src/main/askai/core/processor/generic_processor.py @@ -53,6 +53,7 @@ def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]: cache.save_reply(query_response.question, output) cache.save_query_history() status = True + shared.context.clear("INTERNET", "SUMMARY") else: output = msg.llm_error(response.message) diff --git a/src/main/askai/core/processor/internet_processor.py b/src/main/askai/core/processor/internet_processor.py index d422d69c..3ebf49b8 100644 --- a/src/main/askai/core/processor/internet_processor.py +++ b/src/main/askai/core/processor/internet_processor.py @@ -55,6 +55,8 @@ def process(self, query_response: QueryResponse) -> Tuple[bool, Optional[str]]: shared.context.set("INTERNET", output, "assistant") cache.save_reply(query_response.question, output) status = True + else: + output = msg.search_empty() else: output = msg.llm_error(response.message) else: diff --git a/src/main/askai/core/support/langchain_support.py b/src/main/askai/core/support/langchain_support.py index 8a008d67..e1784e9a 100644 --- a/src/main/askai/core/support/langchain_support.py +++ b/src/main/askai/core/support/langchain_support.py @@ -1,13 +1,20 @@ -from typing import Any, Dict, List +from functools import lru_cache +from typing import Any, Dict, List, Type from hspylib.core.metaclass.singleton import Singleton from hspylib.core.preconditions import check_not_none +from langchain_core.documents import Document from langchain_core.messages import AIMessage, HumanMessage, SystemMessage from askai.core.model.chat_context import ChatContext from askai.core.support.shared_instances import shared +def load_document(loader_type: Type, url: str | List[str]) -> List[Document]: + """TODO""" + return loader_type(url).load() + + class LangChainSupport(metaclass=Singleton): """TODO""" @@ -16,18 +23,28 @@ class LangChainSupport(metaclass=Singleton): LANGCHAIN_ROLE_MAP: Dict = {"user": HumanMessage, "system": SystemMessage, "assistant": AIMessage} @staticmethod - def create_model(temperature: float = 0.8, top_p: float = 0.0) -> Any: + @lru_cache + def create_model(temperature: float = 0.0, top_p: float = 0.0) -> Any: """TODO""" check_not_none(shared.engine, "AI Engine was not created yet!") return shared.engine.lc_model(temperature, top_p) @staticmethod + @lru_cache + def create_chat_model(temperature: float = 0.0) -> Any: + """TODO""" + check_not_none(shared.engine, "AI Engine was not created yet!") + return shared.engine.lc_chat_model(temperature) + + @staticmethod + @lru_cache def create_embeddings() -> Any: """TODO""" check_not_none(shared.engine, "AI Engine was not created yet!") return shared.engine.lc_embeddings() @classmethod + @lru_cache def get_context(cls, key: str) -> List: """TODO""" context: List[ChatContext.ContextEntry] = shared.context[key] diff --git a/src/main/askai/core/support/utilities.py b/src/main/askai/core/support/utilities.py index c2144966..74705f38 100644 --- a/src/main/askai/core/support/utilities.py +++ b/src/main/askai/core/support/utilities.py @@ -12,22 +12,23 @@ Copyright·(c)·2024,·HSPyLib """ -from askai.core.support.presets import Presets -from askai.language.language import Language +import hashlib +import os +import re +from os.path import basename, dirname +from pathlib import Path +from typing import Any, Optional, Tuple + +import pause from clitt.core.term.cursor import Cursor from hspylib.core.enums.charset import Charset from hspylib.core.preconditions import check_argument from hspylib.core.tools.commons import file_is_not_empty, sysout from hspylib.core.tools.text_tools import ensure_endswith from hspylib.modules.cli.vt100.vt_color import VtColor -from os.path import basename, dirname -from pathlib import Path -from typing import Any, Optional, Tuple -import hashlib -import os -import pause -import re +from askai.core.support.presets import Presets +from askai.language.language import Language ASKAI_CHAT_ICONS = { "": "%RED%", @@ -45,8 +46,12 @@ def beautify(text: Any) -> str: :param text: The text to be beautified. """ # fmt: off - re_url = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})' + re_url = ( + r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|' + r'www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))' + r'[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})') text = str(text) + text = re.sub(r"\n{2,}", '\n\n', text) text = re.sub(r"[Hh]ints?( and tips)?[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Tips: ", text) text = re.sub(r"[Aa]nalysis[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Analysis: ", text) text = re.sub(r"[Ss]ummary[-:\s][ \n\t]*", f"{ASKAI_CHAT_ICONS['']}{''} Summary:", text) @@ -137,7 +142,7 @@ def stream_text(text: Any, tempo: int = 1, language: Language = Language.EN_US) for i, char in enumerate(text): if char == "%" and (i + 1) < len(text): try: - if (color := text[i + 1 : text.index("%", i + 1)]) in VtColor.names(): + if (color := text[i + 1: text.index("%", i + 1)]) in VtColor.names(): hide, idx = True, text.index("%", i + 1) sysout(f"%{color}%", end="") continue @@ -188,4 +193,17 @@ def stream_text(text: Any, tempo: int = 1, language: Language = Language.EN_US) if __name__ == '__main__': - display_text(" Error: 'LLM' returned an error: Directory not found: 'HomeSetup/docs/'") + display_text(""" + Este text tem ln + + + + + aqui + + + + este + + """ + ) diff --git a/src/main/askai/resources/assets/prompts/analysis-prompt.txt b/src/main/askai/resources/assets/prompts/analysis-prompt.txt index fe0c8ab7..973175ed 100644 --- a/src/main/askai/resources/assets/prompts/analysis-prompt.txt +++ b/src/main/askai/resources/assets/prompts/analysis-prompt.txt @@ -14,4 +14,4 @@ Before responding to the user, it is imperative that you follow the step-by-step - Start your response with the phrase: Analysing the provided data\n -- Wrap up your reply by offering a summarized analysis about the content; prefix with: \n'Analysis:'. +- Wrap up your reply by offering a summarized analysis about the content; prefix with: 'Analysis:'. diff --git a/src/main/askai/resources/assets/prompts/generic-prompt.txt b/src/main/askai/resources/assets/prompts/generic-prompt.txt index 023ae05d..acf33d37 100644 --- a/src/main/askai/resources/assets/prompts/generic-prompt.txt +++ b/src/main/askai/resources/assets/prompts/generic-prompt.txt @@ -4,16 +4,16 @@ Before responding to the user, it is imperative that you follow the step-by-step - Respond back with a bit sense of Humor and creativity. -- The user's name is '{user}'. When addressing him, kindly utilize his name. +- Start your response with the correct answer for the question, NEVER with a joke, fun fact or advice. -- When you receive a prompt containing an internet search output, start the answer with: According to Google . +- The user's name is '{user}'. When addressing him, kindly utilize his name. -- When you receive a prompt containing a summarized output, start the answer with: According to the summarized content . +- When you receive a prompt containing Internet search results, and if you need this context to respond back to the user, start the answer with: According to Google. -- Start your response with the correct answer for the question, NEVER with a joke, fun fact or advice. +- When you receive a prompt containing Summarization results, and if you need this context to respond back to the user, start the answer with: According to the summarized content. -- Sometimes, you can wrap up your response by dropping a fun fact about query or response; prefix with: \n'Fun Fact:'. +- Sometimes, you can wrap up your response by dropping a fun fact about query or response; prefix with: 'Fun Fact:'. -- When the query doesn't necessitate seriousness, swap out the fun fact with a joke; prefix with: \n'Joke:'. +- When the query doesn't necessitate seriousness, swap out the fun fact with a joke; prefix with: 'Joke:'. -- When the query necessitate seriousness, swap out the fun fact or joke with an advice; prefix with: \n'Advice:'. +- When the query necessitate seriousness, swap out the fun fact or joke with an advice; prefix with: 'Advice:'. diff --git a/src/main/askai/resources/assets/prompts/internet-prompt.txt b/src/main/askai/resources/assets/prompts/internet-prompt.txt index c0777098..b8f31d79 100644 --- a/src/main/askai/resources/assets/prompts/internet-prompt.txt +++ b/src/main/askai/resources/assets/prompts/internet-prompt.txt @@ -6,7 +6,7 @@ Before responding to the user, you must follow the step-by-step instructions pro - Determine a list of keywords that when combined are good for retrieving the required information for a successful response. Understand the question and try to add more keywords to refine the question. -- Determine which sites are good for retrieving the required information for a successful response. Please include a minimum of one URL, and a maximum of five. +- Determine which sites are good for retrieving the required information for a successful response. Please include a minimum of three URLs, and a maximum of five. - When the current date is important to retrieve accurate responses. If that's the case, provide the specific field: 'after', containing '{cur_date}'. diff --git a/src/main/askai/resources/assets/prompts/output-prompt.txt b/src/main/askai/resources/assets/prompts/output-prompt.txt index 93e305a8..e79eb99e 100644 --- a/src/main/askai/resources/assets/prompts/output-prompt.txt +++ b/src/main/askai/resources/assets/prompts/output-prompt.txt @@ -16,4 +16,4 @@ Before responding to the user, it is imperative that you follow the step-by-step - Start your response with the phrase: Here is a summarized version of the provided data\n\n -- Wrap up your reply by offering a succinct hint or tip related to the topic; prefix with: \n'Hints:'. +- Wrap up your reply by offering a succinct hint or tip related to the topic; prefix with: 'Hints:'. diff --git a/src/main/askai/resources/assets/prompts/proxy-prompt.txt b/src/main/askai/resources/assets/prompts/proxy-prompt.txt index a69860f3..10a6b88a 100644 --- a/src/main/askai/resources/assets/prompts/proxy-prompt.txt +++ b/src/main/askai/resources/assets/prompts/proxy-prompt.txt @@ -12,7 +12,7 @@ Before responding to the user, it is imperative that you follow the step-by-step - Determine whether the query suggests the user intends to change the subject of the conversation. -- Determine whether real-time updates are required for ensuring the utmost accuracy and relevance in responses. This pertains specifically to situations where the required information cannot be retrieved from the chat history or your existing database. +- Determine whether real-time data are required for ensuring the utmost accuracy and relevance in responses. This pertains specifically to situations where the required information may be outdated or cannot be retrieved from the chat history or your existing database. - Determine whether summarizing documents is necessary to provide an accurate and comprehensive response.