From 7a02cb72aff81368d8ba75c2b4106fe9f7ff1689 Mon Sep 17 00:00:00 2001
From: "Tuan Anh Nguyen Dang (Tadashi_Cin)" <tadashi@cinnamon.is>
Date: Sun, 15 Dec 2024 23:13:52 +0700
Subject: [PATCH] feat: add URL indexing directly from chat input (#571)
 bump:patch

* feat: enable lightrag by default and add graphrag key check

* feat: add URL indexing from chatbox
---
 flowsettings.py                               |  2 +-
 .../index/file/graph/lightrag_pipelines.py    |  6 ++-
 .../ktem/index/file/graph/nano_pipelines.py   |  4 +-
 libs/ktem/ktem/index/file/graph/pipelines.py  | 15 ++++++++
 libs/ktem/ktem/index/file/ui.py               |  5 +++
 libs/ktem/ktem/pages/chat/__init__.py         | 37 ++++++++++++++++---
 libs/ktem/ktem/utils/__init__.py              |  4 +-
 libs/ktem/ktem/utils/conversation.py          | 10 +++++
 8 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/flowsettings.py b/flowsettings.py
index 4fe8b4ab7..119adab12 100644
--- a/flowsettings.py
+++ b/flowsettings.py
@@ -296,7 +296,7 @@
 }
 
 USE_NANO_GRAPHRAG = config("USE_NANO_GRAPHRAG", default=False, cast=bool)
-USE_LIGHTRAG = config("USE_LIGHTRAG", default=False, cast=bool)
+USE_LIGHTRAG = config("USE_LIGHTRAG", default=True, cast=bool)
 
 GRAPHRAG_INDEX_TYPES = ["ktem.index.file.graph.GraphRAGIndex"]
 
diff --git a/libs/ktem/ktem/index/file/graph/lightrag_pipelines.py b/libs/ktem/ktem/index/file/graph/lightrag_pipelines.py
index 98ee2126e..95dd58ec5 100644
--- a/libs/ktem/ktem/index/file/graph/lightrag_pipelines.py
+++ b/libs/ktem/ktem/index/file/graph/lightrag_pipelines.py
@@ -45,7 +45,7 @@
 filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "lightrag"
 filestorage_path.mkdir(parents=True, exist_ok=True)
 
-INDEX_BATCHSIZE = 2
+INDEX_BATCHSIZE = 4
 
 
 def get_llm_func(model):
@@ -268,7 +268,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]):
 
         for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE):
             cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE]
-            graphrag_func.insert(cur_docs)
+            combined_doc = "\n".join(cur_docs)
+
+            graphrag_func.insert(combined_doc)
             process_doc_count += len(cur_docs)
             yield Document(
                 channel="debug",
diff --git a/libs/ktem/ktem/index/file/graph/nano_pipelines.py b/libs/ktem/ktem/index/file/graph/nano_pipelines.py
index 9b3009384..bfee52286 100644
--- a/libs/ktem/ktem/index/file/graph/nano_pipelines.py
+++ b/libs/ktem/ktem/index/file/graph/nano_pipelines.py
@@ -263,7 +263,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]):
         )
         for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE):
             cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE]
-            graphrag_func.insert(cur_docs)
+            combined_doc = "\n".join(cur_docs)
+
+            graphrag_func.insert(combined_doc)
             process_doc_count += len(cur_docs)
             yield Document(
                 channel="debug",
diff --git a/libs/ktem/ktem/index/file/graph/pipelines.py b/libs/ktem/ktem/index/file/graph/pipelines.py
index 42cfe6ddb..d1a12c677 100644
--- a/libs/ktem/ktem/index/file/graph/pipelines.py
+++ b/libs/ktem/ktem/index/file/graph/pipelines.py
@@ -47,6 +47,14 @@
 filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "graphrag"
 filestorage_path.mkdir(parents=True, exist_ok=True)
 
+GRAPHRAG_KEY_MISSING_MESSAGE = (
+    "GRAPHRAG_API_KEY is not set. Please set it to use the GraphRAG retriever pipeline."
+)
+
+
+def check_graphrag_api_key():
+    return bool(os.getenv("GRAPHRAG_API_KEY"))  # unset or empty -> False
+
 
 def prepare_graph_index_path(graph_id: str):
     root_path = Path(filestorage_path) / graph_id
@@ -99,6 +107,9 @@ def write_docs_to_files(self, graph_id: str, docs: list[Document]):
         return root_path
 
     def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
+        if not check_graphrag_api_key():
+            raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)
+
         # call GraphRAG index with docs and graph_id
         input_path = self.write_docs_to_files(graph_id, all_docs)
         input_path = str(input_path.absolute())
@@ -346,6 +357,10 @@ def run(
     ) -> list[RetrievedDocument]:
         if not self.file_ids:
             return []
+
+        if not check_graphrag_api_key():
+            raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)
+
         context_builder = self._build_graph_search()
 
         local_context_params = {
diff --git a/libs/ktem/ktem/index/file/ui.py b/libs/ktem/ktem/index/file/ui.py
index 6b1fdf473..417391c5f 100644
--- a/libs/ktem/ktem/index/file/ui.py
+++ b/libs/ktem/ktem/index/file/ui.py
@@ -683,6 +683,11 @@ def on_register_events(self):
             if self._index.id == 1:
                 self.quick_upload_state = gr.State(value=[])
                 print("Setting up quick upload event")
+
+                # override indexing function from chat page
+                self._app.chat_page.first_indexing_url_fn = (
+                    self.index_fn_url_with_default_loaders
+                )
                 quickUploadedEvent = (
                     self._app.chat_page.quick_file_upload.upload(
                         fn=lambda: gr.update(
diff --git a/libs/ktem/ktem/pages/chat/__init__.py b/libs/ktem/ktem/pages/chat/__init__.py
index 86ed46fa3..8aec594e0 100644
--- a/libs/ktem/ktem/pages/chat/__init__.py
+++ b/libs/ktem/ktem/pages/chat/__init__.py
@@ -22,7 +22,7 @@
 from kotaemon.base import Document
 from kotaemon.indices.ingests.files import KH_DEFAULT_FILE_EXTRACTORS
 
-from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex
+from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex, get_urls
 from .chat_panel import ChatPanel
 from .common import STATE
 from .control import ConversationControl
@@ -140,6 +140,7 @@ def on_building_ui(self):
                         # get the file selector choices for the first index
                         if index_id == 0:
                             self.first_selector_choices = index_ui.selector_choices
+                            self.first_indexing_url_fn = None
 
                         if gr_index:
                             if isinstance(gr_index, list):
@@ -284,6 +285,7 @@ def on_register_events(self):
                     self.chat_panel.text_input,
                     self.chat_panel.chatbot,
                     self._app.user_id,
+                    self._app.settings_state,
                     self.chat_control.conversation_id,
                     self.chat_control.conversation_rn,
                     self.first_selector_choices,
@@ -634,6 +636,7 @@ def submit_msg(
         chat_input,
         chat_history,
         user_id,
+        settings,
         conv_id,
         conv_name,
         first_selector_choices,
@@ -643,22 +646,44 @@ def submit_msg(
             raise ValueError("Input is empty")
 
         chat_input_text = chat_input.get("text", "")
+        file_ids = []
 
-        # get all file names with pattern @"filename" in input_str
-        file_names, chat_input_text = get_file_names_regex(chat_input_text)
         first_selector_choices_map = {
             item[0]: item[1] for item in first_selector_choices
         }
-        file_ids = []
 
-        if file_names:
+        # get all file names with pattern @"filename" in input_str
+        file_names, chat_input_text = get_file_names_regex(chat_input_text)
+        # get all urls in input_str
+        urls, chat_input_text = get_urls(chat_input_text)
+
+        if urls and self.first_indexing_url_fn:
+            print("Detected URLs", urls)
+            file_ids = self.first_indexing_url_fn(
+                "\n".join(urls),
+                True,
+                settings,
+                user_id,
+            )
+        elif file_names:
             for file_name in file_names:
                 file_id = first_selector_choices_map.get(file_name)
                 if file_id:
                     file_ids.append(file_id)
 
+        # add new file ids to the first selector choices
+        first_selector_choices.extend(zip(urls, file_ids))
+
+        # if file_ids is not empty and chat_input_text is empty
+        # set the input to summary
+        if not chat_input_text and file_ids:
+            chat_input_text = "Summary"
+
         if file_ids:
-            selector_output = ["select", file_ids]
+            selector_output = [
+                "select",
+                gr.update(value=file_ids, choices=first_selector_choices),
+            ]
         else:
             selector_output = [gr.update(), gr.update()]
 
diff --git a/libs/ktem/ktem/utils/__init__.py b/libs/ktem/ktem/utils/__init__.py
index 8865bd328..e28e6f3dc 100644
--- a/libs/ktem/ktem/utils/__init__.py
+++ b/libs/ktem/ktem/utils/__init__.py
@@ -1,4 +1,4 @@
-from .conversation import get_file_names_regex
+from .conversation import get_file_names_regex, get_urls
 from .lang import SUPPORTED_LANGUAGE_MAP
 
-__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex"]
+__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex", "get_urls"]
diff --git a/libs/ktem/ktem/utils/conversation.py b/libs/ktem/ktem/utils/conversation.py
index 2dc95b13a..ddc3cd114 100644
--- a/libs/ktem/ktem/utils/conversation.py
+++ b/libs/ktem/ktem/utils/conversation.py
@@ -29,5 +29,15 @@ def get_file_names_regex(input_str: str) -> tuple[list[str], str]:
     return matches, input_str
 
 
+def get_urls(input_str: str) -> tuple[list[str], str]:
+    """Return (urls found in input_str, input_str with those urls removed)."""
+    # trailing sentence punctuation ("see https://a.b.") is not part of the URL
+    pattern = r"https?://[^\s]+"
+    matches = [url.rstrip(".,;:!?") for url in re.findall(pattern, input_str)]
+    input_str = re.sub(pattern, "", input_str).strip()
+
+    return matches, input_str
+
+
 if __name__ == "__main__":
     print(sync_retrieval_n_message([[""], [""], [""]], []))