Cinnamon · lone17 · May 25, 2024 · May 2, 2024 · May 10, 2024 · May 25, 2024
diff --git a/libs/kotaemon/kotaemon/base/component.py b/libs/kotaemon/kotaemon/base/component.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import AsyncGenerator, Iterator, Optional
+from typing import Any, AsyncGenerator, Iterator, Optional
 
 from theflow import Function, Node, Param, lazy
 
@@ -58,7 +58,7 @@ def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:
     @abstractmethod
     def run(
         self, *args, **kwargs
-    ) -> Document | list[Document] | Iterator[Document] | None:
+    ) -> Document | list[Document] | Iterator[Document] | None | Any:
         """Run the component."""
         ...
 

diff --git a/libs/kotaemon/kotaemon/base/schema.py b/libs/kotaemon/kotaemon/base/schema.py
@@ -32,12 +32,13 @@ class Document(BaseDocument):
         channel: the channel to show the document. Optional.:
             - chat: show in chat message
             - info: show in information panel
+            - index: show in index panel
             - debug: show in debug panel
     """
 
     content: Any = None
     source: Optional[str] = None
-    channel: Optional[Literal["chat", "info", "debug"]] = None
+    channel: Optional[Literal["chat", "info", "index", "debug"]] = None
 
     def __init__(self, content: Optional[Any] = None, *args, **kwargs):
         if content is None:

diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Type
 
+from llama_index.readers import PDFReader
 from llama_index.readers.base import BaseReader
 
 from kotaemon.base import BaseComponent, Document, Param
@@ -17,18 +18,20 @@
     UnstructuredReader,
 )
 
-KH_DEFAULT_FILE_EXTRACTORS: dict[str, Type[BaseReader]] = {
-    ".xlsx": PandasExcelReader,
-    ".docx": UnstructuredReader,
-    ".xls": UnstructuredReader,
-    ".doc": UnstructuredReader,
-    ".html": HtmlReader,
-    ".mhtml": MhtmlReader,
-    ".png": UnstructuredReader,
-    ".jpeg": UnstructuredReader,
-    ".jpg": UnstructuredReader,
-    ".tiff": UnstructuredReader,
-    ".tif": UnstructuredReader,
+unstructured = UnstructuredReader()
+KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
+    ".xlsx": PandasExcelReader(),
+    ".docx": unstructured,
+    ".xls": unstructured,
+    ".doc": unstructured,
+    ".html": HtmlReader(),
+    ".mhtml": MhtmlReader(),
+    ".png": unstructured,
+    ".jpeg": unstructured,
+    ".jpg": unstructured,
+    ".tiff": unstructured,
+    ".tif": unstructured,
+    ".pdf": PDFReader(),
 }
 
 
@@ -64,7 +67,7 @@ class DocumentIngestor(BaseComponent):
     def _get_reader(self, input_files: list[str | Path]):
         """Get appropriate readers for the input files based on file extension"""
         file_extractors: dict[str, BaseReader] = {
-            ext: cls() for ext, cls in KH_DEFAULT_FILE_EXTRACTORS.items()
+            ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()
         }
         for ext, cls in self.override_file_extractors.items():
             file_extractors[ext] = cls()

diff --git a/libs/kotaemon/kotaemon/loaders/base.py b/libs/kotaemon/kotaemon/loaders/base.py
@@ -8,6 +8,8 @@
 
 
 class BaseReader(BaseComponent):
+    """The base class for all readers"""
+
     ...
 
 

diff --git a/libs/ktem/ktem/index/base.py b/libs/ktem/ktem/index/base.py
@@ -126,7 +126,7 @@ def get_indexing_pipeline(
         ...
 
     def get_retriever_pipelines(
-        self, settings: dict, selected: Any = None
+        self, settings: dict, user_id: int, selected: Any = None
     ) -> list["BaseComponent"]:
         """Return the retriever pipelines to retrieve the entity from the index"""
         return []
diff --git a/libs/ktem/ktem/index/file/base.py b/libs/ktem/ktem/index/file/base.py
@@ -1,10 +1,18 @@
 from pathlib import Path
-from typing import Optional
+from typing import Generator, Optional
 
-from kotaemon.base import BaseComponent
+from kotaemon.base import BaseComponent, Document, Param
 
 
 class BaseFileIndexRetriever(BaseComponent):
+
+    Source = Param(help="The SQLAlchemy Source table")
+    Index = Param(help="The SQLAlchemy Index table")
+    VS = Param(help="The VectorStore")
+    DS = Param(help="The DocStore")
+    FSPath = Param(help="The file storage path")
+    user_id = Param(help="The user id")
+
     @classmethod
     def get_user_settings(cls) -> dict:
         """Get the user settings for indexing
@@ -24,20 +32,6 @@ def get_pipeline(
     ) -> "BaseFileIndexRetriever":
         raise NotImplementedError
 
-    def set_resources(self, resources: dict):
-        """Set the resources for the indexing pipeline
-
-        This will setup the tables, the vector store and docstore.
-
-        Args:
-            resources (dict): the resources for the indexing pipeline
-        """
-        self._Source = resources["Source"]
-        self._Index = resources["Index"]
-        self._VS = resources["VectorStore"]
-        self._DS = resources["DocStore"]
-        self._fs_path = resources["FileStoragePath"]
-
 
 class BaseFileIndexIndexing(BaseComponent):
     """The pipeline to index information into the data store
@@ -54,11 +48,45 @@ class BaseFileIndexIndexing(BaseComponent):
         - self._DS: the docstore
     """
 
-    def run(self, file_paths: str | Path | list[str | Path], *args, **kwargs):
+    Source = Param(help="The SQLAlchemy Source table")
+    Index = Param(help="The SQLAlchemy Index table")
+    VS = Param(help="The VectorStore")
+    DS = Param(help="The DocStore")
+    FSPath = Param(help="The file storage path")
+    user_id = Param(help="The user id")
+
+    def run(
+        self, file_paths: str | Path | list[str | Path], *args, **kwargs
+    ) -> tuple[list[str | None], list[str | None]]:
         """Run the indexing pipeline
 
         Args:
             file_paths (str | Path | list[str | Path]): the file paths to index
+
+        Returns:
+            - the indexed file ids (each file id corresponds to an input file path, or
+                None if the indexing failed for that file path)
+            - the error messages (each error message corresponds to an input file path,
+                or None if the indexing was successful for that file path)
+        """
+        raise NotImplementedError
+
+    def stream(
+        self, file_paths: str | Path | list[str | Path], *args, **kwargs
+    ) -> Generator[Document, None, tuple[list[str | None], list[str | None]]]:
+        """Stream the indexing pipeline
+
+        Args:
+            file_paths (str | Path | list[str | Path]): the file paths to index
+
+        Yields:
+            Document: the output message to the UI; channel must be "index" or "debug"
+
+        Returns:
+            - the indexed file ids (each file id corresponds to an input file path, or
+                None if the indexing failed for that file path)
+            - the error messages (each error message corresponds to an input file path,
+                or None if the indexing was successful for that file path)
         """
         raise NotImplementedError
 
@@ -78,20 +106,6 @@ def get_user_settings(cls) -> dict:
         """
         return {}
 
-    def set_resources(self, resources: dict):
-        """Set the resources for the indexing pipeline
-
-        This will setup the tables, the vector store and docstore.
-
-        Args:
-            resources (dict): the resources for the indexing pipeline
-        """
-        self._Source = resources["Source"]
-        self._Index = resources["Index"]
-        self._VS = resources["VectorStore"]
-        self._DS = resources["DocStore"]
-        self._fs_path = resources["FileStoragePath"]
-
     def copy_to_filestorage(
         self, file_paths: str | Path | list[str | Path]
     ) -> list[str]:
@@ -113,7 +127,7 @@ def copy_to_filestorage(
         for file_path in file_paths:
             with open(file_path, "rb") as f:
                 paths.append(sha256(f.read()).hexdigest())
-            shutil.copy(file_path, self._fs_path / paths[-1])
+            shutil.copy(file_path, self.FSPath / paths[-1])
 
         return paths
 

diff --git a/libs/ktem/ktem/index/file/index.py b/libs/ktem/ktem/index/file/index.py
@@ -362,13 +362,17 @@ def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
                 stripped_settings[key] = value
 
         obj = self._indexing_pipeline_cls.get_pipeline(stripped_settings, self.config)
-        obj.set_resources(resources=self._resources)
-        obj._user_id = user_id
+        obj.Source = self._resources["Source"]
+        obj.Index = self._resources["Index"]
+        obj.VS = self._vs
+        obj.DS = self._docstore
+        obj.FSPath = self._fs_path
+        obj.user_id = user_id
 
         return obj
 
     def get_retriever_pipelines(
-        self, settings: dict, selected: Any = None
+        self, settings: dict, user_id: int, selected: Any = None
     ) -> list["BaseFileIndexRetriever"]:
         # retrieval settings
         prefix = f"index.options.{self.id}."
@@ -387,7 +391,12 @@ def get_retriever_pipelines(
             obj = cls.get_pipeline(stripped_settings, self.config, selected_ids)
             if obj is None:
                 continue
-            obj.set_resources(self._resources)
+            obj.Source = self._resources["Source"]
+            obj.Index = self._resources["Index"]
+            obj.VS = self._vs
+            obj.DS = self._docstore
+            obj.FSPath = self._fs_path
+            obj.user_id = user_id
             retrievers.append(obj)
 
         return retrievers
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,6 +8,8 @@


		class BaseReader(BaseComponent):
		"""The base class for all readers"""

		...


Expand Down