From 7a02cb72aff81368d8ba75c2b4106fe9f7ff1689 Mon Sep 17 00:00:00 2001 From: "Tuan Anh Nguyen Dang (Tadashi_Cin)" Date: Sun, 15 Dec 2024 23:13:52 +0700 Subject: [PATCH] feat: add URL indexing directly from chat input (#571) bump:patch * feat: enable lightrag by default and add graphrag key check * feat: add URL indexing from chatbox --- flowsettings.py | 2 +- .../index/file/graph/lightrag_pipelines.py | 6 ++- .../ktem/index/file/graph/nano_pipelines.py | 4 +- libs/ktem/ktem/index/file/graph/pipelines.py | 15 ++++++++ libs/ktem/ktem/index/file/ui.py | 5 +++ libs/ktem/ktem/pages/chat/__init__.py | 37 ++++++++++++++++--- libs/ktem/ktem/utils/__init__.py | 4 +- libs/ktem/ktem/utils/conversation.py | 10 +++++ 8 files changed, 71 insertions(+), 12 deletions(-) diff --git a/flowsettings.py b/flowsettings.py index 4fe8b4ab7..119adab12 100644 --- a/flowsettings.py +++ b/flowsettings.py @@ -296,7 +296,7 @@ } USE_NANO_GRAPHRAG = config("USE_NANO_GRAPHRAG", default=False, cast=bool) -USE_LIGHTRAG = config("USE_LIGHTRAG", default=False, cast=bool) +USE_LIGHTRAG = config("USE_LIGHTRAG", default=True, cast=bool) GRAPHRAG_INDEX_TYPES = ["ktem.index.file.graph.GraphRAGIndex"] diff --git a/libs/ktem/ktem/index/file/graph/lightrag_pipelines.py b/libs/ktem/ktem/index/file/graph/lightrag_pipelines.py index 98ee2126e..95dd58ec5 100644 --- a/libs/ktem/ktem/index/file/graph/lightrag_pipelines.py +++ b/libs/ktem/ktem/index/file/graph/lightrag_pipelines.py @@ -45,7 +45,7 @@ filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "lightrag" filestorage_path.mkdir(parents=True, exist_ok=True) -INDEX_BATCHSIZE = 2 +INDEX_BATCHSIZE = 4 def get_llm_func(model): @@ -268,7 +268,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]): for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE): cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE] - graphrag_func.insert(cur_docs) + combined_doc = "\n".join(cur_docs) + + graphrag_func.insert(combined_doc) process_doc_count += 
len(cur_docs) yield Document( channel="debug", diff --git a/libs/ktem/ktem/index/file/graph/nano_pipelines.py b/libs/ktem/ktem/index/file/graph/nano_pipelines.py index 9b3009384..bfee52286 100644 --- a/libs/ktem/ktem/index/file/graph/nano_pipelines.py +++ b/libs/ktem/ktem/index/file/graph/nano_pipelines.py @@ -263,7 +263,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]): ) for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE): cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE] - graphrag_func.insert(cur_docs) + combined_doc = "\n".join(cur_docs) + + graphrag_func.insert(combined_doc) process_doc_count += len(cur_docs) yield Document( channel="debug", diff --git a/libs/ktem/ktem/index/file/graph/pipelines.py b/libs/ktem/ktem/index/file/graph/pipelines.py index 42cfe6ddb..d1a12c677 100644 --- a/libs/ktem/ktem/index/file/graph/pipelines.py +++ b/libs/ktem/ktem/index/file/graph/pipelines.py @@ -47,6 +47,14 @@ filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "graphrag" filestorage_path.mkdir(parents=True, exist_ok=True) +GRAPHRAG_KEY_MISSING_MESSAGE = ( + "GRAPHRAG_API_KEY is not set. Please set it to use the GraphRAG retriever pipeline." 
+) + + +def check_graphrag_api_key(): + return len(os.getenv("GRAPHRAG_API_KEY", "")) > 0 + def prepare_graph_index_path(graph_id: str): root_path = Path(filestorage_path) / graph_id @@ -99,6 +107,9 @@ def write_docs_to_files(self, graph_id: str, docs: list[Document]): return root_path def call_graphrag_index(self, graph_id: str, all_docs: list[Document]): + if not check_graphrag_api_key(): + raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE) + # call GraphRAG index with docs and graph_id input_path = self.write_docs_to_files(graph_id, all_docs) input_path = str(input_path.absolute()) @@ -346,6 +357,10 @@ def run( ) -> list[RetrievedDocument]: if not self.file_ids: return [] + + if not check_graphrag_api_key(): + raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE) + context_builder = self._build_graph_search() local_context_params = { diff --git a/libs/ktem/ktem/index/file/ui.py b/libs/ktem/ktem/index/file/ui.py index 6b1fdf473..417391c5f 100644 --- a/libs/ktem/ktem/index/file/ui.py +++ b/libs/ktem/ktem/index/file/ui.py @@ -683,6 +683,11 @@ def on_register_events(self): if self._index.id == 1: self.quick_upload_state = gr.State(value=[]) print("Setting up quick upload event") + + # override indexing function from chat page + self._app.chat_page.first_indexing_url_fn = ( + self.index_fn_url_with_default_loaders + ) quickUploadedEvent = ( self._app.chat_page.quick_file_upload.upload( fn=lambda: gr.update( diff --git a/libs/ktem/ktem/pages/chat/__init__.py b/libs/ktem/ktem/pages/chat/__init__.py index 86ed46fa3..8aec594e0 100644 --- a/libs/ktem/ktem/pages/chat/__init__.py +++ b/libs/ktem/ktem/pages/chat/__init__.py @@ -22,7 +22,7 @@ from kotaemon.base import Document from kotaemon.indices.ingests.files import KH_DEFAULT_FILE_EXTRACTORS -from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex +from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex, get_urls from .chat_panel import ChatPanel from .common import STATE from .control import ConversationControl 
@@ -140,6 +140,7 @@ def on_building_ui(self): # get the file selector choices for the first index if index_id == 0: self.first_selector_choices = index_ui.selector_choices + self.first_indexing_url_fn = None if gr_index: if isinstance(gr_index, list): @@ -284,6 +285,7 @@ def on_register_events(self): self.chat_panel.text_input, self.chat_panel.chatbot, self._app.user_id, + self._app.settings_state, self.chat_control.conversation_id, self.chat_control.conversation_rn, self.first_selector_choices, @@ -634,6 +636,7 @@ def submit_msg( chat_input, chat_history, user_id, + settings, conv_id, conv_name, first_selector_choices, @@ -643,22 +646,44 @@ def submit_msg( raise ValueError("Input is empty") chat_input_text = chat_input.get("text", "") + file_ids = [] - # get all file names with pattern @"filename" in input_str - file_names, chat_input_text = get_file_names_regex(chat_input_text) first_selector_choices_map = { item[0]: item[1] for item in first_selector_choices } - file_ids = [] - if file_names: + # get all file names with pattern @"filename" in input_str + file_names, chat_input_text = get_file_names_regex(chat_input_text) + # get all urls in input_str + urls, chat_input_text = get_urls(chat_input_text) + + if urls and self.first_indexing_url_fn: + print("Detected URLs", urls) + file_ids = self.first_indexing_url_fn( + "\n".join(urls), + True, + settings, + user_id, + ) + elif file_names: for file_name in file_names: file_id = first_selector_choices_map.get(file_name) if file_id: file_ids.append(file_id) + # add new file ids to the first selector choices + first_selector_choices.extend(zip(urls, file_ids)) + + # if file_ids is not empty and chat_input_text is empty + # set the input to summary + if not chat_input_text and file_ids: + chat_input_text = "Summary" + if file_ids: - selector_output = ["select", file_ids] + selector_output = [ + "select", + gr.update(value=file_ids, choices=first_selector_choices), + ] else: selector_output = [gr.update(), gr.update()] 
diff --git a/libs/ktem/ktem/utils/__init__.py b/libs/ktem/ktem/utils/__init__.py
index 8865bd328..e28e6f3dc 100644
--- a/libs/ktem/ktem/utils/__init__.py
+++ b/libs/ktem/ktem/utils/__init__.py
@@ -1,4 +1,4 @@
-from .conversation import get_file_names_regex
+from .conversation import get_file_names_regex, get_urls
 from .lang import SUPPORTED_LANGUAGE_MAP
 
-__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex"]
+__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex", "get_urls"]
diff --git a/libs/ktem/ktem/utils/conversation.py b/libs/ktem/ktem/utils/conversation.py
index 2dc95b13a..ddc3cd114 100644
--- a/libs/ktem/ktem/utils/conversation.py
+++ b/libs/ktem/ktem/utils/conversation.py
@@ -29,5 +29,42 @@ def get_file_names_regex(input_str: str) -> tuple[list[str], str]:
     return matches, input_str
 
 
+def get_urls(input_str: str) -> tuple[list[str], str]:
+    """Extract http(s) URLs from ``input_str`` and remove them from the text.
+
+    Punctuation that only follows a URL in a sentence, such as the final
+    period in "see https://a.b." or the bracket in "(https://a.b)", is not
+    returned as part of the URL.
+
+    Args:
+        input_str: raw text that may contain URLs
+
+    Returns:
+        the URLs in order of appearance, and the stripped input text with
+        every URL match removed
+    """
+    pattern = r"https?://[^\s]+"
+    closing_to_opening = {")": "(", "]": "[", "}": "{", ">": "<"}
+
+    urls = []
+    for url in re.findall(pattern, input_str):
+        while url:
+            last_char = url[-1]
+            opening_char = closing_to_opening.get(last_char)
+            if last_char in ".,;:!?\"'":
+                url = url[:-1]
+            elif opening_char and url.count(last_char) > url.count(opening_char):
+                # unbalanced closing bracket belongs to the surrounding text
+                url = url[:-1]
+            else:
+                break
+        urls.append(url)
+
+    # the full match (trailing punctuation included) is still cut from the text
+    input_str = re.sub(pattern, "", input_str).strip()
+
+    return urls, input_str
+
+
 if __name__ == "__main__":
     print(sync_retrieval_n_message([[""], [""], [""]], []))