Skip to content

Commit 5edb536

Browse files
authored
Fixes issue with user uploaded files (#2830)
* Fix issue with upload
* Revert unneeded changes
* Add test for non-seekable content
1 parent bb6b81c commit 5edb536

File tree

4 files changed

+135
-13
lines changed

4 files changed

+135
-13
lines changed

app/backend/prepdocslib/pdfparser.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,21 +68,27 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
6868
async with DocumentIntelligenceClient(
6969
endpoint=self.endpoint, credential=self.credential
7070
) as document_intelligence_client:
71-
file_analyzed = False
71+
# Always convert to bytes up front to avoid passing a FileStorage/stream object
72+
try:
73+
content.seek(0)
74+
except Exception:
75+
pass
76+
content_bytes = content.read()
77+
78+
poller = None
79+
doc_for_pymupdf = None
80+
7281
if self.process_figures:
73-
content_bytes = content.read()
7482
try:
7583
poller = await document_intelligence_client.begin_analyze_document(
7684
model_id="prebuilt-layout",
77-
analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
85+
body=AnalyzeDocumentRequest(bytes_source=content_bytes),
7886
output=["figures"],
7987
features=["ocrHighResolution"],
8088
output_content_format="markdown",
8189
)
8290
doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
83-
file_analyzed = True
8491
except HttpResponseError as e:
85-
content.seek(0)
8692
if e.error and e.error.code == "InvalidArgument":
8793
logger.error(
8894
"This document type does not support media description. Proceeding with standard analysis."
@@ -92,10 +98,12 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
9298
"Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.",
9399
e,
94100
)
101+
poller = None
95102

96-
if file_analyzed is False:
103+
if poller is None:
97104
poller = await document_intelligence_client.begin_analyze_document(
98-
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
105+
model_id=self.model_id,
106+
body=AnalyzeDocumentRequest(bytes_source=content_bytes),
99107
)
100108
analyze_result: AnalyzeResult = await poller.result()
101109

app/backend/requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ quart-cors
55
openai>=1.109.1
66
tiktoken
77
tenacity
8-
azure-ai-documentintelligence==1.0.0b4
8+
azure-ai-documentintelligence==1.0.2
99
azure-cognitiveservices-speech
1010
azure-cosmos
1111
azure-search-documents==11.7.0b1

app/backend/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ async-timeout==5.0.1
2424
# via aiohttp
2525
attrs==25.3.0
2626
# via aiohttp
27-
azure-ai-documentintelligence==1.0.0b4
27+
azure-ai-documentintelligence==1.0.2
2828
# via -r requirements.in
2929
azure-cognitiveservices-speech==1.40.0
3030
# via -r requirements.in

tests/test_pdfparser.py

Lines changed: 118 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import pytest
1010
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
1111
from azure.ai.documentintelligence.models import (
12+
AnalyzeDocumentRequest,
1213
AnalyzeResult,
1314
BoundingRegion,
1415
DocumentCaption,
@@ -21,6 +22,7 @@
2122
from azure.core.credentials import AzureKeyCredential
2223
from azure.core.exceptions import HttpResponseError
2324
from PIL import Image, ImageChops
25+
from werkzeug.datastructures import FileStorage
2426

2527
from prepdocslib.figureprocessor import (
2628
FigureProcessor,
@@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
178180
@pytest.mark.asyncio
179181
async def test_parse_simple(monkeypatch):
180182
mock_poller = MagicMock()
183+
captured_bodies: list[AnalyzeDocumentRequest] = []
181184

182-
async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
185+
async def mock_begin_analyze_document(self, model_id, **kwargs):
186+
body = kwargs["body"]
187+
captured_bodies.append(body)
183188
return mock_poller
184189

185190
async def mock_poller_result():
@@ -205,13 +210,106 @@ async def mock_poller_result():
205210
assert pages[0].page_num == 0
206211
assert pages[0].offset == 0
207212
assert pages[0].text == "Page content"
213+
assert len(captured_bodies) == 1
214+
assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
215+
assert captured_bodies[0].bytes_source == b"pdf content bytes"
216+
217+
218+
@pytest.mark.asyncio
async def test_parse_with_filestorage(monkeypatch):
    """Parse a werkzeug ``FileStorage`` upload and check the bytes sent for analysis.

    The Document Intelligence client is patched so no network call is made;
    the request body handed to ``begin_analyze_document`` is recorded and
    must be an ``AnalyzeDocumentRequest`` carrying the uploaded bytes.
    """
    payload = b"pdf content bytes"
    sent_requests: list[AnalyzeDocumentRequest] = []
    fake_poller = MagicMock()

    async def fake_begin_analyze_document(self, model_id, **kwargs):
        # Record the request body so the assertions below can inspect it.
        sent_requests.append(kwargs["body"])
        return fake_poller

    async def fake_poller_result():
        single_page = DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])
        return AnalyzeResult(
            content="Page content",
            pages=[single_page],
            tables=[],
            figures=[],
        )

    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", fake_begin_analyze_document)
    monkeypatch.setattr(fake_poller, "result", fake_poller_result)

    parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
    )
    upload = FileStorage(stream=io.BytesIO(payload), filename="upload.pdf")
    upload.name = "upload.pdf"

    pages = [page async for page in parser.parse(upload)]

    # Exactly one page comes back, carrying the mocked content.
    assert len(pages) == 1
    first_page = pages[0]
    assert first_page.page_num == 0
    assert first_page.offset == 0
    assert first_page.text == "Page content"

    # Exactly one analysis request was made, with the upload's raw bytes.
    assert len(sent_requests) == 1
    request_body = sent_requests[0]
    assert isinstance(request_body, AnalyzeDocumentRequest)
    assert request_body.bytes_source == payload
254+
255+
256+
@pytest.mark.asyncio
async def test_parse_with_non_seekable_stream(monkeypatch):
    """Parse content from an object that offers ``read()`` and ``name`` but no ``seek()``.

    The Document Intelligence client is patched so no network call is made;
    the request body handed to ``begin_analyze_document`` is recorded and
    must be an ``AnalyzeDocumentRequest`` carrying the stream's bytes.
    """
    mock_poller = MagicMock()
    captured_bodies: list[AnalyzeDocumentRequest] = []

    async def mock_begin_analyze_document(self, model_id, **kwargs):
        # Record the request body so the assertions below can inspect it.
        captured_bodies.append(kwargs["body"])
        return mock_poller

    async def mock_poller_result():
        return AnalyzeResult(
            content="Page content",
            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
            tables=[],
            figures=[],
        )

    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
    monkeypatch.setattr(mock_poller, "result", mock_poller_result)

    class NonSeekableStream:
        """Minimal stand-in for a stream: exposes ``name`` and ``read()`` only.

        It deliberately defines no ``seek()``, so any attempt to rewind it
        raises ``AttributeError``.
        """

        def __init__(self, data: bytes, name: str):
            self._data = data
            self._name = name

        @property
        def name(self) -> str:
            return self._name

        def read(self) -> bytes:
            return self._data

    parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
    )

    stream = NonSeekableStream(b"pdf content bytes", "nonseekable.pdf")
    pages = [page async for page in parser.parse(stream)]

    assert len(pages) == 1
    assert pages[0].page_num == 0
    assert pages[0].offset == 0
    assert pages[0].text == "Page content"
    assert len(captured_bodies) == 1
    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
    assert captured_bodies[0].bytes_source == b"pdf content bytes"
208304

209305

210306
@pytest.mark.asyncio
211307
async def test_parse_doc_with_tables(monkeypatch):
212308
mock_poller = MagicMock()
309+
captured_bodies: list[AnalyzeDocumentRequest] = []
213310

214-
async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
311+
async def mock_begin_analyze_document(self, model_id, **kwargs):
312+
captured_bodies.append(kwargs["body"])
215313
return mock_poller
216314

217315
async def mock_poller_result():
@@ -281,13 +379,17 @@ async def mock_poller_result():
281379
pages[0].text
282380
== "# Simple HTML Table\n\n\n<figure><table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table></figure>"
283381
)
382+
assert len(captured_bodies) == 1
383+
assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
284384

285385

286386
@pytest.mark.asyncio
287387
async def test_parse_doc_with_figures(monkeypatch):
288388
mock_poller = MagicMock()
389+
captured_kwargs: list[dict] = []
289390

290-
async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
391+
async def mock_begin_analyze_document(self, model_id, **kwargs):
392+
captured_kwargs.append(kwargs)
291393
return mock_poller
292394

293395
async def mock_poller_result():
@@ -330,13 +432,20 @@ async def mock_poller_result():
330432
== '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure id="1.1"></figure>\n\n\nThis is text after the figure that\'s not part of it.'
331433
)
332434
assert pages[0].images[0].placeholder == '<figure id="1.1"></figure>'
435+
assert len(captured_kwargs) == 1
436+
body = captured_kwargs[0]["body"]
437+
assert isinstance(body, AnalyzeDocumentRequest)
438+
assert captured_kwargs[0]["output"] == ["figures"]
439+
assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
333440

334441

335442
@pytest.mark.asyncio
336443
async def test_parse_unsupportedformat(monkeypatch, caplog):
337444
mock_poller = MagicMock()
445+
captured_kwargs: list[dict] = []
338446

339-
async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
447+
async def mock_begin_analyze_document(self, model_id, **kwargs):
448+
captured_kwargs.append(kwargs)
340449

341450
if kwargs.get("features") == ["ocrHighResolution"]:
342451

@@ -387,6 +496,11 @@ async def mock_poller_result():
387496
assert pages[0].page_num == 0
388497
assert pages[0].offset == 0
389498
assert pages[0].text == "Page content"
499+
assert len(captured_kwargs) == 2
500+
assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
501+
assert isinstance(captured_kwargs[0]["body"], AnalyzeDocumentRequest)
502+
assert captured_kwargs[1].get("features") is None
503+
assert isinstance(captured_kwargs[1]["body"], AnalyzeDocumentRequest)
390504

391505

392506
@pytest.mark.asyncio

0 commit comments

Comments
 (0)