From 54518cef18d05daa7caed89cd9bd8f8ddf12d483 Mon Sep 17 00:00:00 2001 From: trducng Date: Wed, 24 Apr 2024 18:54:36 +0700 Subject: [PATCH 1/4] Add azureai document intelligence loader --- libs/kotaemon/kotaemon/loaders/__init__.py | 2 + .../azureai_document_intelligence_loader.py | 52 +++++++++++++++++++ libs/kotaemon/pyproject.toml | 1 + libs/kotaemon/tests/test_reader.py | 14 +++++ 4 files changed, 69 insertions(+) create mode 100644 libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py index f0e7d0f7f..6ccdbda6b 100644 --- a/libs/kotaemon/kotaemon/loaders/__init__.py +++ b/libs/kotaemon/kotaemon/loaders/__init__.py @@ -1,4 +1,5 @@ from .adobe_loader import AdobeReader +from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader from .base import AutoReader, BaseReader from .composite_loader import DirectoryReader from .docx_loader import DocxReader @@ -10,6 +11,7 @@ __all__ = [ "AutoReader", + "AzureAIDocumentIntelligenceLoader", "BaseReader", "PandasExcelReader", "MathpixPDFReader", diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py new file mode 100644 index 000000000..c50591ada --- /dev/null +++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py @@ -0,0 +1,52 @@ +from pathlib import Path + +from kotaemon.base import Document, Param + +from .base import BaseReader + + +class AzureAIDocumentIntelligenceLoader(BaseReader): + """Utilize Azure AI Document Intelligence to parse document + + As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, + heif, docx, xlsx, pptx and html. + """ + + _dependencies = ["azure-ai-documentintelligence"] + + endpoint: str = Param("Endpoint of Azure AI Document Intelligence") + credential: str = Param("Credential of Azure AI Document Intelligence") + model: str = Param( + "prebuilt-layout", + help=( + "Model to use for document analysis. Default is prebuilt-layout. " + "As of April 24, you can view the supported models [here]" + "(https://learn.microsoft.com/en-us/azure/ai-services/" + "document-intelligence/concept-model-overview?view=doc-intel-4.0.0" + "#model-analysis-features)" + ), + ) + + @Param.auto(depends_on=["endpoint", "credential"]) + def client_(self): + try: + from azure.ai.documentintelligence import DocumentIntelligenceClient + from azure.core.credentials import AzureKeyCredential + except ImportError: + raise ImportError("Please install azure-ai-documentintelligence") + + return DocumentIntelligenceClient( + self.endpoint, AzureKeyCredential(self.credential) + ) + + def run(self, file_path: str | Path, **kwargs) -> list[Document]: + with open(file_path, "rb") as fi: + poller = self.client_.begin_analyze_document( + self.model, + analyze_request=fi, + content_type="application/octet-stream", + output_content_format="markdown", + ) + result = poller.result() + + return [Document(content=result.content)] diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml index 43678b4f8..29b833f80 100644 --- a/libs/kotaemon/pyproject.toml +++ b/libs/kotaemon/pyproject.toml @@ -64,6 +64,7 @@ adv = [ "pdfservices-sdk @ git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements", "fastembed", "beautifulsoup4", + "azure-ai-documentintelligence", ] dev = [ "ipython", diff --git a/libs/kotaemon/tests/test_reader.py b/libs/kotaemon/tests/test_reader.py index 0aa2f2b48..0cdfa5180 100644 --- a/libs/kotaemon/tests/test_reader.py +++ b/libs/kotaemon/tests/test_reader.py @@ -1,4 +1,5 @@ from pathlib import Path +from unittest.mock import patch from langchain.schema import Document as LangchainDocument from llama_index.node_parser import SimpleNodeParser @@ -6,6 +7,7 @@ from kotaemon.base import Document from kotaemon.loaders import ( AutoReader, + AzureAIDocumentIntelligenceLoader, DocxReader, HtmlReader, MhtmlReader, @@ -76,3 +78,15 @@ def test_mhtml_reader(): assert len(docs) == 1 assert docs[0].text.startswith("This is a test") + + +@patch("azure.ai.documentintelligence.DocumentIntelligenceClient") +def test_azureai_document_intelligence_reader(mock_client): + reader = AzureAIDocumentIntelligenceLoader( + endpoint="https://endpoint.com", + credential="credential", + ) + docs = reader(Path(__file__).parent / "resources" / "dummy.pdf") + + assert len(docs) == 1 + mock_client.assert_called_once() From e86237a425886df998a6d6b104684dcd56cad854 Mon Sep 17 00:00:00 2001 From: trducng Date: Thu, 25 Apr 2024 19:13:42 +0700 Subject: [PATCH 2/4] Add load_data interface to Azure DI --- .../loaders/azureai_document_intelligence_loader.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py index c50591ada..2a831c35f 100644 --- a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py +++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Optional from kotaemon.base import Document, Param @@ -39,7 +40,15 @@ def client_(self): self.endpoint, AzureKeyCredential(self.credential) ) - def run(self, file_path: str | Path, **kwargs) -> list[Document]: + def run( + self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs + ) -> list[Document]: + return self.load_data(Path(file_path), extra_info=extra_info, **kwargs) + + def load_data( + self, file_path: Path, extra_info: Optional[dict] = None, **kwargs + ) -> list[Document]: + metadata = extra_info or {} with open(file_path, "rb") as fi: poller = self.client_.begin_analyze_document( self.model, @@ -49,4 +58,4 @@ def run(self, file_path: str | Path, **kwargs) -> list[Document]: ) result = poller.result() - return [Document(content=result.content)] + return [Document(content=result.content, metadata=metadata)] From f32539e4faa305acbb97d48f369ce895bd866206 Mon Sep 17 00:00:00 2001 From: trducng Date: Thu, 25 Apr 2024 22:15:57 +0700 Subject: [PATCH 3/4] Bump version --- libs/kotaemon/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml index 29b833f80..b218b046b 100644 --- a/libs/kotaemon/pyproject.toml +++ b/libs/kotaemon/pyproject.toml @@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"] # metadata and dependencies [project] name = "kotaemon" -version = "0.3.11" +version = "0.3.12" requires-python = ">= 3.10" description = "Kotaemon core library for AI development." dependencies = [ From 524d91c5c3291be80139ff192f1207d8d9fd8756 Mon Sep 17 00:00:00 2001 From: trducng Date: Sat, 27 Apr 2024 14:45:48 +0700 Subject: [PATCH 4/4] Access azure credentials from environment variables --- .../loaders/azureai_document_intelligence_loader.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py index 2a831c35f..7e4c516bd 100644 --- a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py +++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from typing import Optional @@ -15,8 +16,14 @@ class AzureAIDocumentIntelligenceLoader(BaseReader): _dependencies = ["azure-ai-documentintelligence"] - endpoint: str = Param("Endpoint of Azure AI Document Intelligence") - credential: str = Param("Credential of Azure AI Document Intelligence") + endpoint: str = Param( + os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT", None), + help="Endpoint of Azure AI Document Intelligence", + ) + credential: str = Param( + os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL", None), + help="Credential of Azure AI Document Intelligence", + ) model: str = Param( "prebuilt-layout", help=(