Skip to content

Commit

Permalink
feat: add URL indexing directly from chat input (#571) bump:patch
Browse files Browse the repository at this point in the history
* feat: enable lightrag by default and add graphrag key check

* feat: add URL indexing from chatbox
  • Loading branch information
taprosoft authored Dec 15, 2024
1 parent a0c9a6e commit 7a02cb7
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 12 deletions.
2 changes: 1 addition & 1 deletion flowsettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@
}

USE_NANO_GRAPHRAG = config("USE_NANO_GRAPHRAG", default=False, cast=bool)
USE_LIGHTRAG = config("USE_LIGHTRAG", default=False, cast=bool)
USE_LIGHTRAG = config("USE_LIGHTRAG", default=True, cast=bool)

GRAPHRAG_INDEX_TYPES = ["ktem.index.file.graph.GraphRAGIndex"]

Expand Down
6 changes: 4 additions & 2 deletions libs/ktem/ktem/index/file/graph/lightrag_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "lightrag"
filestorage_path.mkdir(parents=True, exist_ok=True)

INDEX_BATCHSIZE = 2
INDEX_BATCHSIZE = 4


def get_llm_func(model):
Expand Down Expand Up @@ -268,7 +268,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]):

for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE):
cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE]
graphrag_func.insert(cur_docs)
combined_doc = "\n".join(cur_docs)

graphrag_func.insert(combined_doc)
process_doc_count += len(cur_docs)
yield Document(
channel="debug",
Expand Down
4 changes: 3 additions & 1 deletion libs/ktem/ktem/index/file/graph/nano_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,9 @@ def call_graphrag_index(self, graph_id: str, docs: list[Document]):
)
for doc_id in range(0, len(all_docs), INDEX_BATCHSIZE):
cur_docs = all_docs[doc_id : doc_id + INDEX_BATCHSIZE]
graphrag_func.insert(cur_docs)
combined_doc = "\n".join(cur_docs)

graphrag_func.insert(combined_doc)
process_doc_count += len(cur_docs)
yield Document(
channel="debug",
Expand Down
15 changes: 15 additions & 0 deletions libs/ktem/ktem/index/file/graph/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@
filestorage_path = Path(settings.KH_FILESTORAGE_PATH) / "graphrag"
filestorage_path.mkdir(parents=True, exist_ok=True)

# Error text used when the GRAPHRAG_API_KEY environment variable is missing.
# It is raised as a ValueError by both the GraphRAG indexing path
# (call_graphrag_index) and the retriever's run() method.
GRAPHRAG_KEY_MISSING_MESSAGE = (
"GRAPHRAG_API_KEY is not set. Please set it to use the GraphRAG retriever pipeline."
)


def check_graphrag_api_key() -> bool:
    """Return True when a usable GRAPHRAG_API_KEY is present in the environment.

    The variable counts as missing when it is unset, empty, or contains only
    whitespace. A blank key would otherwise pass this check and only fail
    later, deep inside the GraphRAG call, with a much less helpful error.

    Returns:
        True if GRAPHRAG_API_KEY holds at least one non-whitespace character,
        False otherwise.
    """
    return bool(os.getenv("GRAPHRAG_API_KEY", "").strip())


def prepare_graph_index_path(graph_id: str):
root_path = Path(filestorage_path) / graph_id
Expand Down Expand Up @@ -99,6 +107,9 @@ def write_docs_to_files(self, graph_id: str, docs: list[Document]):
return root_path

def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
if not check_graphrag_api_key():
raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)

# call GraphRAG index with docs and graph_id
input_path = self.write_docs_to_files(graph_id, all_docs)
input_path = str(input_path.absolute())
Expand Down Expand Up @@ -346,6 +357,10 @@ def run(
) -> list[RetrievedDocument]:
if not self.file_ids:
return []

if not check_graphrag_api_key():
raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)

context_builder = self._build_graph_search()

local_context_params = {
Expand Down
5 changes: 5 additions & 0 deletions libs/ktem/ktem/index/file/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,11 @@ def on_register_events(self):
if self._index.id == 1:
self.quick_upload_state = gr.State(value=[])
print("Setting up quick upload event")

# override indexing function from chat page
self._app.chat_page.first_indexing_url_fn = (
self.index_fn_url_with_default_loaders
)
quickUploadedEvent = (
self._app.chat_page.quick_file_upload.upload(
fn=lambda: gr.update(
Expand Down
37 changes: 31 additions & 6 deletions libs/ktem/ktem/pages/chat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from kotaemon.base import Document
from kotaemon.indices.ingests.files import KH_DEFAULT_FILE_EXTRACTORS

from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex
from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex, get_urls
from .chat_panel import ChatPanel
from .common import STATE
from .control import ConversationControl
Expand Down Expand Up @@ -140,6 +140,7 @@ def on_building_ui(self):
# get the file selector choices for the first index
if index_id == 0:
self.first_selector_choices = index_ui.selector_choices
self.first_indexing_url_fn = None

if gr_index:
if isinstance(gr_index, list):
Expand Down Expand Up @@ -284,6 +285,7 @@ def on_register_events(self):
self.chat_panel.text_input,
self.chat_panel.chatbot,
self._app.user_id,
self._app.settings_state,
self.chat_control.conversation_id,
self.chat_control.conversation_rn,
self.first_selector_choices,
Expand Down Expand Up @@ -634,6 +636,7 @@ def submit_msg(
chat_input,
chat_history,
user_id,
settings,
conv_id,
conv_name,
first_selector_choices,
Expand All @@ -643,22 +646,44 @@ def submit_msg(
raise ValueError("Input is empty")

chat_input_text = chat_input.get("text", "")
file_ids = []

# get all file names with pattern @"filename" in input_str
file_names, chat_input_text = get_file_names_regex(chat_input_text)
first_selector_choices_map = {
item[0]: item[1] for item in first_selector_choices
}
file_ids = []

if file_names:
# get all file names with pattern @"filename" in input_str
file_names, chat_input_text = get_file_names_regex(chat_input_text)
# get all urls in input_str
urls, chat_input_text = get_urls(chat_input_text)

if urls and self.first_indexing_url_fn:
print("Detected URLs", urls)
file_ids = self.first_indexing_url_fn(
"\n".join(urls),
True,
settings,
user_id,
)
elif file_names:
for file_name in file_names:
file_id = first_selector_choices_map.get(file_name)
if file_id:
file_ids.append(file_id)

# add new file ids to the first selector choices
first_selector_choices.extend(zip(urls, file_ids))

# if file_ids is not empty and chat_input_text is empty
# set the input to summary
if not chat_input_text and file_ids:
chat_input_text = "Summary"

if file_ids:
selector_output = ["select", file_ids]
selector_output = [
"select",
gr.update(value=file_ids, choices=first_selector_choices),
]
else:
selector_output = [gr.update(), gr.update()]

Expand Down
4 changes: 2 additions & 2 deletions libs/ktem/ktem/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .conversation import get_file_names_regex
from .conversation import get_file_names_regex, get_urls
from .lang import SUPPORTED_LANGUAGE_MAP

__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex"]
__all__ = ["SUPPORTED_LANGUAGE_MAP", "get_file_names_regex", "get_urls"]
10 changes: 10 additions & 0 deletions libs/ktem/ktem/utils/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,15 @@ def get_file_names_regex(input_str: str) -> tuple[list[str], str]:
return matches, input_str


# Characters that commonly follow a URL in running text but are not part of it.
_URL_TRAILING_PUNCTUATION = ".,;:!?\"'"


def _strip_url_trailing_punctuation(url: str) -> str:
    """Drop sentence punctuation and unbalanced ")" from the end of *url*."""
    while url:
        last = url[-1]
        if last in _URL_TRAILING_PUNCTUATION:
            url = url[:-1]
        elif last == ")" and url.count(")") > url.count("("):
            # A closing paren with no matching "(" inside the URL belongs to
            # the surrounding text, e.g. "(see https://example.com)".
            url = url[:-1]
        else:
            break
    return url


def get_urls(input_str: str) -> tuple[list[str], str]:
    """Extract http(s) URLs from *input_str* and remove them from the text.

    A URL is any run of non-whitespace characters starting with ``http://``
    or ``https://``. Trailing sentence punctuation (``. , ; : ! ?`` and
    quotes) and unbalanced closing parentheses are trimmed from each returned
    URL so that text such as ``"read https://example.com."`` yields a
    fetchable address.

    Args:
        input_str: Raw chat input that may contain URLs.

    Returns:
        A tuple ``(urls, remaining_text)`` where ``urls`` lists the cleaned
        URLs in order of appearance and ``remaining_text`` is the input with
        every matched URL span (including any trimmed punctuation) removed
        and surrounding whitespace stripped.
    """
    pattern = r"https?://[^\s]+"
    matches = [
        _strip_url_trailing_punctuation(raw)
        for raw in re.findall(pattern, input_str)
    ]
    # Remove the full raw match so the leftover text is unchanged from the
    # previous behaviour; only the returned URLs are cleaned.
    input_str = re.sub(pattern, "", input_str).strip()

    return matches, input_str


if __name__ == "__main__":
print(sync_retrieval_n_message([[""], [""], [""]], []))

0 comments on commit 7a02cb7

Please sign in to comment.