Cinnamon · lone17 · Apr 29, 2024 · Apr 24, 2024 · Apr 25, 2024 · Apr 25, 2024
diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py
@@ -1,4 +1,5 @@
 from .adobe_loader import AdobeReader
+from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
 from .base import AutoReader, BaseReader
 from .composite_loader import DirectoryReader
 from .docx_loader import DocxReader
@@ -10,6 +11,7 @@
 
 __all__ = [
     "AutoReader",
+    "AzureAIDocumentIntelligenceLoader",
     "BaseReader",
     "PandasExcelReader",
     "MathpixPDFReader",

diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
@@ -0,0 +1,61 @@
+from pathlib import Path
+from typing import Optional
+
+from kotaemon.base import Document, Param
+
+from .base import BaseReader
+
+
+class AzureAIDocumentIntelligenceLoader(BaseReader):
+    """Parse a document with Azure AI Document Intelligence.
+
+    The file is sent as a binary stream to the configured analysis model and
+    the markdown content of the analysis result is returned as one Document.
+
+    As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp,
+    tiff, heif, docx, xlsx, pptx and html.
+
+    Attributes:
+        endpoint: Endpoint of the Azure AI Document Intelligence service.
+        credential: Key for the service; wrapped in an AzureKeyCredential.
+        model: ID of the analysis model; defaults to "prebuilt-layout".
+    """
+
+    _dependencies = ["azure-ai-documentintelligence"]
+
+    # The first positional argument of Param is the default value (see `model`
+    # below), so descriptions must go through `help=`. Passing them
+    # positionally would make the description text the default endpoint/key.
+    endpoint: str = Param(help="Endpoint of Azure AI Document Intelligence")
+    credential: str = Param(help="Credential of Azure AI Document Intelligence")
+    model: str = Param(
+        "prebuilt-layout",
+        help=(
+            "Model to use for document analysis. Default is prebuilt-layout. "
+            "As of April 24, you can view the supported models [here]"
+            "(https://learn.microsoft.com/en-us/azure/ai-services/"
+            "document-intelligence/concept-model-overview?view=doc-intel-4.0.0"
+            "#model-analysis-features)"
+        ),
+    )
+
+    @Param.auto(depends_on=["endpoint", "credential"])
+    def client_(self):
+        """Build the Document Intelligence client from endpoint and credential.
+
+        The Azure SDK is imported inside this method instead of at module
+        level, so an ImportError can only surface when the client is built.
+
+        Returns:
+            A DocumentIntelligenceClient authenticated with an
+            AzureKeyCredential created from `credential`.
+
+        Raises:
+            ImportError: If the Azure SDK modules cannot be imported.
+        """
+        try:
+            from azure.ai.documentintelligence import DocumentIntelligenceClient
+            from azure.core.credentials import AzureKeyCredential
+        except ImportError as exc:
+            # Chain the original error so the module that failed to import
+            # stays visible in the traceback.
+            raise ImportError(
+                "Please install azure-ai-documentintelligence"
+            ) from exc
+
+        return DocumentIntelligenceClient(
+            self.endpoint, AzureKeyCredential(self.credential)
+        )
+
+    def run(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        """Normalize `file_path` to a Path and delegate to `load_data`.
+
+        Args:
+            file_path: Location of the document to analyze.
+            extra_info: Optional metadata to attach to the returned document.
+            **kwargs: Forwarded unchanged to `load_data`.
+
+        Returns:
+            The list produced by `load_data`.
+        """
+        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)
+
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        """Analyze a file and return its markdown content as one Document.
+
+        Args:
+            file_path: Path of the document to analyze; opened in binary mode.
+            extra_info: Optional metadata for the returned document. When
+                given, the same dict object is used (it is not copied).
+            **kwargs: Currently ignored.
+
+        Returns:
+            A single-element list holding a Document whose content is the
+            analysis result's `content` and whose metadata is `extra_info`
+            (or an empty dict).
+        """
+        metadata = extra_info or {}
+        with open(file_path, "rb") as fi:
+            poller = self.client_.begin_analyze_document(
+                self.model,
+                analyze_request=fi,
+                content_type="application/octet-stream",
+                output_content_format="markdown",
+            )
+            # The file handle stays open until the analysis result is fetched.
+            result = poller.result()
+
+        return [Document(content=result.content, metadata=metadata)]
diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml
@@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
 # metadata and dependencies
 [project]
 name = "kotaemon"
-version = "0.3.11"
+version = "0.3.12"
 requires-python = ">= 3.10"
 description = "Kotaemon core library for AI development."
 dependencies = [
@@ -64,6 +64,7 @@ adv = [
     "pdfservices-sdk @  git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements",
     "fastembed",
     "beautifulsoup4",
+    "azure-ai-documentintelligence",
 ]
 dev = [
     "ipython",

diff --git a/libs/kotaemon/tests/test_reader.py b/libs/kotaemon/tests/test_reader.py
@@ -1,11 +1,13 @@
 from pathlib import Path
+from unittest.mock import patch
 
 from langchain.schema import Document as LangchainDocument
 from llama_index.node_parser import SimpleNodeParser
 
 from kotaemon.base import Document
 from kotaemon.loaders import (
     AutoReader,
+    AzureAIDocumentIntelligenceLoader,
     DocxReader,
     HtmlReader,
     MhtmlReader,
@@ -76,3 +78,15 @@ def test_mhtml_reader():
 
     assert len(docs) == 1
     assert docs[0].text.startswith("This is a test")
+
+
+@patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
+def test_azureai_document_intelligence_reader(mock_client):
+    """Loading a PDF yields one document and builds the Azure client once."""
+    # The SDK client class is patched, so the mock stands in for the client.
+    sample_pdf = Path(__file__).parent / "resources" / "dummy.pdf"
+    loader = AzureAIDocumentIntelligenceLoader(
+        credential="credential",
+        endpoint="https://endpoint.com",
+    )
+
+    documents = loader(sample_pdf)
+
+    assert len(documents) == 1
+    mock_client.assert_called_once()