Fix UI bugs (Cinnamon#8)

* Auto create conversation when the user starts * Add conversation rename rule check * Fix empty name during save * Confirm deleting conversation * Show warning if users don't select a file when uploading files in the File Index * Feedback when user uploads duplicated file * Limit the file types * Fix valid username * Allow login when the username has leading and trailing whitespace * Improve the user * Disable admin panel for non-admin users * Refresh user lists after creating/deleting users * Auto logging in * Clear admin information upon signing out * Fix inability to receive uploaded filenames that include special characters, like !@#$%^&*().pdf * Set upload validation for FileIndex * Improve user management UI/UX * Show extraction error when indexing file * Return selected user -1 when signing out * Fix default supported file types in file index * Validate changing password * Allow the selector to contain multiple gradio components * A more tolerable placeholder screen * Allow chat suggestion box * Increase concurrency limit * Make adobe loader optional * Use BaseReasoning --------- Co-authored-by: trducng <trungduc1992@gmail.com>
phv2312 · Sep 11, 2024 · 506979c · 506979c
1 parent 28a02f4
commit 506979c
Show file tree

Hide file tree

Showing 23 changed files with 936 additions and 255 deletions.
diff --git a/.gitignore b/.gitignore
@@ -452,6 +452,7 @@ $RECYCLE.BIN/
 .theflow/
 
 # End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
+*.py[coid]
 
 logs/
 .gitsecret/keys/random_seed

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -52,7 +52,12 @@ repos:
     hooks:
       - id: mypy
         additional_dependencies:
-          [types-PyYAML==6.0.12.11, "types-requests", "sqlmodel"]
+          [
+            types-PyYAML==6.0.12.11,
+            "types-requests",
+            "sqlmodel",
+            "types-Markdown",
+          ]
         args: ["--check-untyped-defs", "--ignore-missing-imports"]
         exclude: "^templates/"
   - repo: https://github.com/codespell-project/codespell

diff --git a/libs/kotaemon/kotaemon/indices/qa/citation.py b/libs/kotaemon/kotaemon/indices/qa/citation.py
@@ -104,18 +104,16 @@ def invoke(self, context: str, question: str):
             print("CitationPipeline: invoking LLM")
             llm_output = self.get_from_path("llm").invoke(messages, **llm_kwargs)
             print("CitationPipeline: finish invoking LLM")
+            if not llm_output.messages:
+                return None
+            function_output = llm_output.messages[0].additional_kwargs["function_call"][
+                "arguments"
+            ]
+            output = QuestionAnswer.parse_raw(function_output)
         except Exception as e:
             print(e)
             return None
 
-        if not llm_output.messages:
-            return None
-
-        function_output = llm_output.messages[0].additional_kwargs["function_call"][
-            "arguments"
-        ]
-        output = QuestionAnswer.parse_raw(function_output)
-
         return output
 
     async def ainvoke(self, context: str, question: str):

diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py
@@ -5,14 +5,15 @@
 from .excel_loader import PandasExcelReader
 from .html_loader import HtmlReader
 from .mathpix_loader import MathpixPDFReader
-from .ocr_loader import OCRReader
+from .ocr_loader import ImageReader, OCRReader
 from .unstructured_loader import UnstructuredReader
 
 __all__ = [
     "AutoReader",
     "BaseReader",
     "PandasExcelReader",
     "MathpixPDFReader",
+    "ImageReader",
     "OCRReader",
     "DirectoryReader",
     "UnstructuredReader",

diff --git a/libs/kotaemon/kotaemon/loaders/adobe_loader.py b/libs/kotaemon/kotaemon/loaders/adobe_loader.py
@@ -10,14 +10,6 @@
 
 from kotaemon.base import Document
 
-from .utils.adobe import (
-    generate_figure_captions,
-    load_json,
-    parse_figure_paths,
-    parse_table_paths,
-    request_adobe_service,
-)
-
 logger = logging.getLogger(__name__)
 
 DEFAULT_VLM_ENDPOINT = (
@@ -74,6 +66,13 @@ def load_data(
                 includes 3 types: text, table, and image
 
         """
+        from .utils.adobe import (
+            generate_figure_captions,
+            load_json,
+            parse_figure_paths,
+            parse_table_paths,
+            request_adobe_service,
+        )
 
         filename = file.name
         filepath = str(Path(file).resolve())

diff --git a/libs/kotaemon/kotaemon/loaders/ocr_loader.py b/libs/kotaemon/kotaemon/loaders/ocr_loader.py
@@ -125,3 +125,70 @@ def load_data(
         )
 
         return documents
+
+
+class ImageReader(BaseReader):
+    """Read a file by posting it to an OCR endpoint, one document per result
+
+    Example:
+        ```python
+        >> from kotaemon.loaders import ImageReader
+        >> reader = ImageReader()
+        >> documents = reader.load_data("path/to/file")
+        ```
+
+    Args:
+        endpoint: URL to FullOCR endpoint. If not provided, will look for
+            environment variable `OCR_READER_ENDPOINT` or use the default
+            `DEFAULT_OCR_ENDPOINT`.
+
+    NOTE(review): the file type is not checked here; presumably image files
+        are expected (going by the class name) -- confirm against the callers.
+    """
+
+    def __init__(self, endpoint: Optional[str] = None):
+        """Init the reader with the OCR endpoint (FullOCR pipeline)"""
+        super().__init__()
+        self.ocr_endpoint = endpoint or os.getenv(
+            "OCR_READER_ENDPOINT", DEFAULT_OCR_ENDPOINT
+        )
+
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> List[Document]:
+        """Post the file to the OCR endpoint, one Document per result
+
+        Args:
+            file_path (Path): Path to the file to read
+            extra_info (dict): metadata attached to every returned document
+            **kwargs: `response_content`, if given, replaces the endpoint call
+
+        Returns:
+            List[Document]: one document per OCR result (its `csv_string`)
+        """
+        file_path = Path(file_path).resolve()
+
+        with file_path.open("rb") as content:
+            files = {"input": content}
+            data = {"job_id": uuid4(), "table_only": False}
+
+            # get the OCR results, either injected by the caller or from the API
+            if "response_content" in kwargs:
+                # use the caller-supplied results instead of calling the endpoint
+                ocr_results = kwargs["response_content"]
+            else:
+                # post the opened file to the endpoint and read the "result" field
+                resp = tenacious_api_post(url=self.ocr_endpoint, files=files, data=data)
+                ocr_results = resp.json()["result"]
+
+        extra_info = extra_info or {}
+        result = []
+        for ocr_result in ocr_results:
+            result.append(
+                Document(
+                    content=ocr_result["csv_string"],
+                    metadata=extra_info,
+                )
+            )
+
+        return result
diff --git a/libs/ktem/ktem/app.py b/libs/ktem/ktem/app.py
@@ -229,7 +229,9 @@ def on_register_events(self):
     def _on_app_created(self):
         """Called when the app is created"""
 
-    def as_gradio_component(self) -> Optional[gr.components.Component]:
+    def as_gradio_component(
+        self,
+    ) -> Optional[gr.components.Component | list[gr.components.Component]]:
         """Return the gradio components responsible for events
 
         Note: in ideal scenario, this method shouldn't be necessary.

diff --git a/libs/ktem/ktem/index/base.py b/libs/ktem/ktem/index/base.py
@@ -1,6 +1,6 @@
 import abc
 import logging
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Any, Optional
 
 if TYPE_CHECKING:
     from ktem.app import BasePage
@@ -57,7 +57,7 @@ def __init__(self, app, id, name, config):
         self._app = app
         self.id = id
         self.name = name
-        self._config = config  # admin settings
+        self.config = config  # admin settings
 
     def on_create(self):
         """Create the index for the first time"""
@@ -121,7 +121,7 @@ def get_indexing_pipeline(self, settings: dict) -> "BaseComponent":
         ...
 
     def get_retriever_pipelines(
-        self, settings: dict, selected: Optional[list]
+        self, settings: dict, selected: Any = None
     ) -> list["BaseComponent"]:
         """Return the retriever pipelines to retrieve the entity from the index"""
         return []
diff --git a/libs/ktem/ktem/index/file/base.py b/libs/ktem/ktem/index/file/base.py
@@ -127,3 +127,11 @@ def get_filestorage_path(self, rel_paths: str | list[str]) -> list[str]:
             the absolute file storage path to the file
         """
         raise NotImplementedError
+
+    def warning(self, msg):
+        """Print a warning message to standard output
+
+        Args:
+            msg: the message to print
+        """
+        print(msg)