Fixes issue with user uploaded files (#2830)

pamelafox · web-flow · commit 5edb53666b21 · 2025-11-12T13:49:57.000-08:00
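* Fix issue with upload: read the uploaded content into bytes up front and pass it to Document Intelligence as `body=AnalyzeDocumentRequest(bytes_source=...)` instead of passing the stream object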

* Revert unneeded changes

* Add test for non-seekable content
diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
@@ -68,21 +68,27 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
         async with DocumentIntelligenceClient(
             endpoint=self.endpoint, credential=self.credential
         ) as document_intelligence_client:
-            file_analyzed = False
+            # Always convert to bytes up front to avoid passing a FileStorage/stream object
+            try:
+                content.seek(0)
+            except Exception:
+                pass
+            content_bytes = content.read()
+
+            poller = None
+            doc_for_pymupdf = None
+
             if self.process_figures:
-                content_bytes = content.read()
                 try:
                     poller = await document_intelligence_client.begin_analyze_document(
                         model_id="prebuilt-layout",
-                        analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
+                        body=AnalyzeDocumentRequest(bytes_source=content_bytes),
                         output=["figures"],
                         features=["ocrHighResolution"],
                         output_content_format="markdown",
                     )
                     doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
-                    file_analyzed = True
                 except HttpResponseError as e:
-                    content.seek(0)
                     if e.error and e.error.code == "InvalidArgument":
                         logger.error(
                             "This document type does not support media description. Proceeding with standard analysis."
@@ -92,10 +98,12 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
                             "Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.",
                             e,
                         )
+                    poller = None
 
-            if file_analyzed is False:
+            if poller is None:
                 poller = await document_intelligence_client.begin_analyze_document(
-                    model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
+                    model_id=self.model_id,
+                    body=AnalyzeDocumentRequest(bytes_source=content_bytes),
                 )
             analyze_result: AnalyzeResult = await poller.result()
 
diff --git a/app/backend/requirements.in b/app/backend/requirements.in
@@ -5,7 +5,7 @@ quart-cors
 openai>=1.109.1
 tiktoken
 tenacity
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
 azure-cognitiveservices-speech
 azure-cosmos
 azure-search-documents==11.7.0b1
diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt
@@ -24,7 +24,7 @@ async-timeout==5.0.1
     # via aiohttp
 attrs==25.3.0
     # via aiohttp
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
     # via -r requirements.in
 azure-cognitiveservices-speech==1.40.0
     # via -r requirements.in
diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py
@@ -9,6 +9,7 @@
 import pytest
 from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
 from azure.ai.documentintelligence.models import (
+    AnalyzeDocumentRequest,
     AnalyzeResult,
     BoundingRegion,
     DocumentCaption,
@@ -21,6 +22,7 @@
 from azure.core.credentials import AzureKeyCredential
 from azure.core.exceptions import HttpResponseError
 from PIL import Image, ImageChops
+from werkzeug.datastructures import FileStorage
 
 from prepdocslib.figureprocessor import (
     FigureProcessor,
@@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
 @pytest.mark.asyncio
 async def test_parse_simple(monkeypatch):
     mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
 
-    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        body = kwargs["body"]
+        captured_bodies.append(body)
         return mock_poller
 
     async def mock_poller_result():
@@ -205,13 +210,106 @@ async def mock_poller_result():
     assert pages[0].page_num == 0
     assert pages[0].offset == 0
     assert pages[0].text == "Page content"
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
+    assert captured_bodies[0].bytes_source == b"pdf content bytes"
+
+
+@pytest.mark.asyncio
+async def test_parse_with_filestorage(monkeypatch):
+    mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
+
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_bodies.append(kwargs["body"])
+        return mock_poller
+
+    async def mock_poller_result():
+        return AnalyzeResult(
+            content="Page content",
+            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
+            tables=[],
+            figures=[],
+        )
+
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
+    monkeypatch.setattr(mock_poller, "result", mock_poller_result)
+
+    parser = DocumentAnalysisParser(
+        endpoint="https://example.com",
+        credential=MockAzureCredential(),
+    )
+    stream = io.BytesIO(b"pdf content bytes")
+    file_storage = FileStorage(stream=stream, filename="upload.pdf")
+    file_storage.name = "upload.pdf"
+    pages = [page async for page in parser.parse(file_storage)]
+
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "Page content"
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
+    assert captured_bodies[0].bytes_source == b"pdf content bytes"
+
+
+@pytest.mark.asyncio
+async def test_parse_with_non_seekable_stream(monkeypatch):
+    mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
+
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_bodies.append(kwargs["body"])
+        return mock_poller
+
+    async def mock_poller_result():
+        return AnalyzeResult(
+            content="Page content",
+            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
+            tables=[],
+            figures=[],
+        )
+
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
+    monkeypatch.setattr(mock_poller, "result", mock_poller_result)
+
+    class NonSeekableStream:
+        def __init__(self, data: bytes, name: str):
+            self._data = data
+            self._name = name
+            self._consumed = False
+
+        @property
+        def name(self) -> str:  # type: ignore[override]
+            return self._name
+
+        def read(self) -> bytes:
+            return self._data
+
+    parser = DocumentAnalysisParser(
+        endpoint="https://example.com",
+        credential=MockAzureCredential(),
+    )
+
+    stream = NonSeekableStream(b"pdf content bytes", "nonseekable.pdf")
+    pages = [page async for page in parser.parse(stream)]
+
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "Page content"
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
+    assert captured_bodies[0].bytes_source == b"pdf content bytes"
 
 
 @pytest.mark.asyncio
 async def test_parse_doc_with_tables(monkeypatch):
     mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
 
-    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_bodies.append(kwargs["body"])
         return mock_poller
 
     async def mock_poller_result():
@@ -281,13 +379,17 @@ async def mock_poller_result():
         pages[0].text
         == "# Simple HTML Table\n\n\n<figure><table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table></figure>"
     )
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
 
 
 @pytest.mark.asyncio
 async def test_parse_doc_with_figures(monkeypatch):
     mock_poller = MagicMock()
+    captured_kwargs: list[dict] = []
 
-    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_kwargs.append(kwargs)
         return mock_poller
 
     async def mock_poller_result():
@@ -330,13 +432,20 @@ async def mock_poller_result():
         == '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure id="1.1"></figure>\n\n\nThis is text after the figure that\'s not part of it.'
     )
     assert pages[0].images[0].placeholder == '<figure id="1.1"></figure>'
+    assert len(captured_kwargs) == 1
+    body = captured_kwargs[0]["body"]
+    assert isinstance(body, AnalyzeDocumentRequest)
+    assert captured_kwargs[0]["output"] == ["figures"]
+    assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
 
 
 @pytest.mark.asyncio
 async def test_parse_unsupportedformat(monkeypatch, caplog):
     mock_poller = MagicMock()
+    captured_kwargs: list[dict] = []
 
-    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_kwargs.append(kwargs)
 
         if kwargs.get("features") == ["ocrHighResolution"]:
 
@@ -387,6 +496,11 @@ async def mock_poller_result():
     assert pages[0].page_num == 0
     assert pages[0].offset == 0
     assert pages[0].text == "Page content"
+    assert len(captured_kwargs) == 2
+    assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
+    assert isinstance(captured_kwargs[0]["body"], AnalyzeDocumentRequest)
+    assert captured_kwargs[1].get("features") is None
+    assert isinstance(captured_kwargs[1]["body"], AnalyzeDocumentRequest)
 
 
 @pytest.mark.asyncio