diff --git a/docs/agents/FEATURE_SPEC.md b/docs/agents/FEATURE_SPEC.md index 435001e..55753aa 100644 --- a/docs/agents/FEATURE_SPEC.md +++ b/docs/agents/FEATURE_SPEC.md @@ -4,46 +4,6 @@ --- -## セル結合データのコンテキスト量圧縮 - -- 現状の `merged_cells` がコンテキスト量を非常に多く持っているため、データ構造の見直しで圧縮する -- `rows` と `merged_cells` でセル値を重複して持っているため、出力時に `rows` 側の結合セル値を落とす運用を検討する - -### 仕様(v1.1 予定) - -- `merged_cells` を **schema + items** 形式へ変更して冗長なキーを削減する -- 結合セルの値は `merged_cells` に集約し、`rows` 側に保持するかはフラグで切替可能にする - -#### merged_cells の新フォーマット(例) - -```json -{ - "merged_cells": { - "schema": ["r1", "c1", "r2", "c2", "v"], - "items": [ - [1, 0, 2, 1, "A1-B2 merged"], - [3, 4, 3, 6, "merged value"] - ] - } -} -``` - -- `r1/c1/r2/c2` は従来同様の座標(row: 1-based, col: 0-based) -- `v` は結合セルの代表値(セル値がない場合でも `" "` を出力する) - -#### rows 側の結合セル値の扱い - -- 新しいフラグ `include_merged_values_in_rows: bool` を導入 -- `True` の場合は互換モード(従来どおり `rows` に結合セル値を残す) -- `False` の場合は `rows` から結合セル値を排除し、`merged_cells` のみで値を保持 - -#### 互換性 - -- デフォルトは `True` として破壊的変更を回避 -- 将来的にデフォルト切替の可能性があるため、出力仕様に明記する - ---- - ## 今後のオプション検討メモ - 表検知スコアリングの閾値を CLI/環境変数で調整可能にする diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index 4a643d7..742e3b4 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -2,10 +2,7 @@ 未完了 [ ], 完了 [x] -- [x] 仕様: `merged_cells` の新フォーマット(schema + items)をモデルと出力仕様に反映 -- [x] 仕様: `include_merged_values_in_rows` フラグ追加(デフォルト True) -- [x] 実装: 既存の `merged_cells` 生成ロジックを新構造へ置換 -- [x] 実装: `rows` から結合セル値を排除する分岐を追加(フラグ制御) -- [x] 実装: 結合セルの値がない場合は `" "` を出力 -- [ ] 更新: 既存の JSON 出力例・ドキュメントの整合性確認 -- [x] テスト: 結合セルが多いケースの JSON 量削減を確認 +- [x] 仕様確認: 画像出力は DPI を維持しつつ、メモリリーク/クラッシュ回避のためサブプロセス化で処理する方針を明記 +- [x] 実装方針: シートごとに PDF を分割 → サブプロセスで PDF ページを PNG へ変換 → 終了時にメモリを解放する設計(親は進捗/結果を集約) +- [x] 実装方針: 子プロセスは `pypdfium2` をロードしてページごとにレンダリングし、書き込み済みパスを親に返す +- [x] 実装方針: 例外時は子プロセスでエラーを返し、親が RenderError として集約して返す diff --git a/docs/release-notes/v0.3.6.md b/docs/release-notes/v0.3.6.md new file mode 100644 index 0000000..913d0ab --- /dev/null +++ b/docs/release-notes/v0.3.6.md @@ -0,0 +1,17 @@ +# v0.3.6 Release Notes + +This release improves rendering robustness for image export and large Excel +files, with better support for multi-page sheets and legacy .xls inputs. + +## Highlights + +- Sheet image export now renders all PDF pages per sheet, with `_pNN` suffixes + for page 2+ (fixes multi-print-range sheets outputting only the first image). +- .xls rendering now uses Excel SaveAs to a temporary .xlsx before PDF export, + avoiding failures when outputting images from legacy files. +- Image rendering can run in a subprocess to isolate memory usage and reduce + crashes on large workbooks (enabled by default). + +## Notes + +- Set `EXSTRUCT_RENDER_SUBPROCESS=0` to disable subprocess rendering. diff --git a/mkdocs.yml b/mkdocs.yml index 126d2dc..33a0b0f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 @@ nav: - CLI Guide: cli.md - Concept / Why ExStruct?: concept.md - Release Notes: + - v0.3.6: release-notes/v0.3.6.md - v0.3.5: release-notes/v0.3.5.md - v0.3.2: release-notes/v0.3.2.md - v0.3.1: release-notes/v0.3.1.md diff --git a/pyproject.toml b/pyproject.toml index 731ae9a..7b6fe5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "exstruct" -version = "0.3.5" +version = "0.3.6" description = "Excel to structured JSON (tables, shapes, charts) for LLM/RAG pipelines" readme = "README.md" license = { file = "LICENSE" } diff --git a/src/exstruct/render/__init__.py b/src/exstruct/render/__init__.py index fde18a4..9c76481 100644 --- a/src/exstruct/render/__init__.py +++ b/src/exstruct/render/__init__.py @@ -1,6 +1,8 @@ from __future__ import annotations import logging +import multiprocessing as mp +import os from pathlib import Path import shutil import tempfile @@ -35,14 +37,15 @@ def export_pdf(excel_path: str | Path, output_pdf: str | Path) -> list[str]: temp_dir = Path(td) temp_xlsx = temp_dir / "book.xlsx" temp_pdf = temp_dir / "book.pdf" - shutil.copy(normalized_excel_path, temp_xlsx) app: xw.App | None = None wb: xw.Book | None = None try: app = _require_excel_app() - wb = app.books.open(str(temp_xlsx)) + app.display_alerts = False + wb = app.books.open(str(normalized_excel_path)) sheet_names = [s.name for s in wb.sheets] + wb.api.SaveAs(str(temp_xlsx)) wb.api.ExportAsFixedFormat(0, str(temp_pdf)) shutil.copy(temp_pdf, normalized_output_pdf) except RenderError: @@ -77,28 +80,55 @@ def export_sheet_images( excel_path: str | Path, output_dir: str | Path, dpi: int = 144 ) -> list[Path]: """Export each sheet as PNG (via PDF then pypdfium2 rasterization) and return paths in sheet order.""" - pdfium = cast(Any, _require_pdfium()) normalized_excel_path = Path(excel_path) normalized_output_dir = Path(output_dir) normalized_output_dir.mkdir(parents=True, exist_ok=True) + use_subprocess = _use_render_subprocess() + if not use_subprocess: + pdfium = cast(Any, _require_pdfium()) + else: + _require_pdfium() try: with tempfile.TemporaryDirectory() as td: - tmp_pdf = Path(td) / "book.pdf" - sheet_names = export_pdf(normalized_excel_path, tmp_pdf) - - scale = dpi / 72.0 written: list[Path] = [] - with pdfium.PdfDocument(str(tmp_pdf)) as pdf: - for i, sheet_name in enumerate(sheet_names): - page = pdf[i] - bitmap = page.render(scale=scale) - pil_image = bitmap.to_pil() + app: xw.App | None = None + wb: xw.Book | None = None + try: + app = _require_excel_app() + wb = app.books.open(str(normalized_excel_path)) + for sheet_index, sheet in enumerate(wb.sheets): + sheet_name = sheet.name + sheet_pdf = Path(td) / f"sheet_{sheet_index + 1:02d}.pdf" + sheet.api.ExportAsFixedFormat(0, str(sheet_pdf)) safe_name = _sanitize_sheet_filename(sheet_name) - img_path = normalized_output_dir / f"{i + 1:02d}_{safe_name}.png" - pil_image.save(img_path, format="PNG", dpi=(dpi, dpi)) - written.append(img_path) - return written + if use_subprocess: + written.extend( + _render_pdf_pages_subprocess( + sheet_pdf, + normalized_output_dir, + sheet_index, + safe_name, + dpi, + ) + ) + else: + written.extend( + _render_pdf_pages_in_process( + pdfium, + sheet_pdf, + normalized_output_dir, + sheet_index, + safe_name, + dpi, + ) + ) + return written + finally: + if wb is not None: + wb.close() + if app is not None: + app.quit() except RenderError: raise except Exception as exc: @@ -111,4 +141,99 @@ def _sanitize_sheet_filename(name: str) -> str: return "".join("_" if c in '\\/:*?"<>|' else c for c in name).strip() or "sheet" +def _use_render_subprocess() -> bool: + """Return True when PDF->PNG rendering should run in a subprocess.""" + return os.getenv("EXSTRUCT_RENDER_SUBPROCESS", "1").lower() not in {"0", "false"} + + +def _render_pdf_pages_in_process( + pdfium: ModuleType, + pdf_path: Path, + output_dir: Path, + sheet_index: int, + safe_name: str, + dpi: int, +) -> list[Path]: + """Render PDF pages to PNGs in the current process.""" + scale = dpi / 72.0 + written: list[Path] = [] + with pdfium.PdfDocument(str(pdf_path)) as pdf: + for page_index in range(len(pdf)): + page = pdf[page_index] + bitmap = page.render(scale=scale) + pil_image = bitmap.to_pil() + page_suffix = f"_p{page_index + 1:02d}" if page_index > 0 else "" + img_path = ( + output_dir / f"{sheet_index + 1:02d}_{safe_name}{page_suffix}.png" + ) + pil_image.save(img_path, format="PNG", dpi=(dpi, dpi)) + written.append(img_path) + return written + + +def _render_pdf_pages_subprocess( + pdf_path: Path, + output_dir: Path, + sheet_index: int, + safe_name: str, + dpi: int, +) -> list[Path]: + """Render PDF pages to PNGs in a subprocess for memory isolation.""" + ctx = mp.get_context("spawn") + queue: mp.Queue[dict[str, list[str] | str]] = ctx.Queue() + process = ctx.Process( + target=_render_pdf_pages_worker, + args=(pdf_path, output_dir, sheet_index, safe_name, dpi, queue), + ) + process.start() + process.join() + result = _get_subprocess_result(queue) + if process.exitcode != 0 or "error" in result: + message = result.get("error", "subprocess failed") + raise RenderError(f"Failed to render PDF pages: {message}") + paths = result.get("paths", []) + return [Path(path) for path in paths] + + +def _get_subprocess_result( + queue: mp.Queue[dict[str, list[str] | str]], +) -> dict[str, list[str] | str]: + """Fetch the worker result from the queue with a timeout.""" + try: + return queue.get(timeout=5) + except Exception as exc: + return {"error": f"subprocess did not return results ({exc})"} + + +def _render_pdf_pages_worker( + pdf_path: Path, + output_dir: Path, + sheet_index: int, + safe_name: str, + dpi: int, + queue: mp.Queue[dict[str, list[str] | str]], +) -> None: + """Worker process to render PDF pages into PNG files.""" + try: + import pypdfium2 as pdfium + + scale = dpi / 72.0 + output_dir.mkdir(parents=True, exist_ok=True) + written: list[str] = [] + with pdfium.PdfDocument(str(pdf_path)) as pdf: + for page_index in range(len(pdf)): + page = pdf[page_index] + bitmap = page.render(scale=scale) + pil_image = bitmap.to_pil() + page_suffix = f"_p{page_index + 1:02d}" if page_index > 0 else "" + img_path = ( + output_dir / f"{sheet_index + 1:02d}_{safe_name}{page_suffix}.png" + ) + pil_image.save(img_path, format="PNG", dpi=(dpi, dpi)) + written.append(str(img_path)) + queue.put({"paths": written}) + except Exception as exc: + queue.put({"error": str(exc)}) + + __all__ = ["export_pdf", "export_sheet_images"] diff --git a/tests/assets/multiple_print_ranges_4sheets.xlsx b/tests/assets/multiple_print_ranges_4sheets.xlsx new file mode 100644 index 0000000..114b2f0 Binary files /dev/null and b/tests/assets/multiple_print_ranges_4sheets.xlsx differ diff --git a/tests/assets/sample.xls b/tests/assets/sample.xls new file mode 100644 index 0000000..480c49c Binary files /dev/null and b/tests/assets/sample.xls differ diff --git a/tests/com/test_render_smoke.py b/tests/com/test_render_smoke.py index 917a715..fb4a38f 100644 --- a/tests/com/test_render_smoke.py +++ b/tests/com/test_render_smoke.py @@ -34,3 +34,36 @@ def test_render_smoke_pdf_and_png(tmp_path: Path) -> None: assert pdf_path.exists() assert images_dir.exists() assert any(images_dir.glob("*.png")) + + +def test_render_multiple_print_ranges_images(tmp_path: Path) -> None: + xlsx = ( + Path(__file__).resolve().parents[1] + / "assets" + / "multiple_print_ranges_4sheets.xlsx" + ) + out_json = tmp_path / "out.json" + process_excel( + xlsx, + output_path=out_json, + out_fmt="json", + image=True, + dpi=72, + mode="standard", + pretty=True, + ) + images_dir = out_json.parent / f"{out_json.stem}_images" + images = list(images_dir.glob("*.png")) + assert images_dir.exists() + prefixes = {_strip_page_suffix(image.stem) for image in images} + assert len(prefixes) == 4 + + +def _strip_page_suffix(stem: str) -> str: + """Return the image stem without the _pNN page suffix.""" + if "_p" not in stem: + return stem + base, suffix = stem.rsplit("_p", 1) + if len(suffix) == 2 and suffix.isdigit(): + return base + return stem diff --git a/tests/render/test_render_init.py b/tests/render/test_render_init.py index 197ffd5..d9b7134 100644 --- a/tests/render/test_render_init.py +++ b/tests/render/test_render_init.py @@ -3,9 +3,13 @@ import builtins from collections.abc import Callable from pathlib import Path -from types import SimpleNamespace +import shutil +import sys +from types import ModuleType, SimpleNamespace +from typing import Any, cast import pytest +import xlwings as xw from exstruct.errors import MissingDependencyError, RenderError import exstruct.render as render @@ -16,6 +20,15 @@ class FakeSheet: def __init__(self, name: str) -> None: self.name = name + self.api = FakeSheetApi() + + +class FakeSheetApi: + """Stub of xlwings Sheet.api for PDF export.""" + + def ExportAsFixedFormat(self, file_format: int, output_path: str) -> None: + _ = file_format + Path(output_path).write_bytes(b"%PDF-1.4") class FakeBookApi: @@ -25,6 +38,9 @@ def ExportAsFixedFormat(self, file_format: int, output_path: str) -> None: _ = file_format Path(output_path).write_bytes(b"%PDF-1.4") + def SaveAs(self, output_path: str) -> None: + Path(output_path).write_bytes(b"XLSX") + class FakeBook: """Stub of xlwings Book.""" @@ -59,6 +75,7 @@ class FakeApp: def __init__(self, sheet_names: list[str], raise_on_open: bool) -> None: self.books = FakeBooks(sheet_names, raise_on_open) + self.display_alerts = True self.quit_called = False def quit(self) -> None: @@ -87,6 +104,7 @@ class FakePdfDocument: def __init__(self, path: str) -> None: self._path = path + self._page_count = 2 if "sheet_01" in path else 1 def __enter__(self) -> FakePdfDocument: return self @@ -106,6 +124,30 @@ def __getitem__(self, index: int) -> FakePage: _ = index return FakePage() + def __len__(self) -> int: + return self._page_count + + +class ExplodingPdfDocument: + """PdfDocument stub that raises on enter.""" + + def __init__(self, path: str) -> None: + _ = path + + def __enter__(self) -> ExplodingPdfDocument: + raise RuntimeError("boom") + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: object | None, + ) -> bool | None: + _ = exc_type + _ = exc + _ = tb + return None + class FakeImage: """Stub of a PIL image with a save method.""" @@ -139,7 +181,7 @@ def _factory(*args: object, **kwargs: object) -> FakeApp: def test_require_excel_app_success(monkeypatch: pytest.MonkeyPatch) -> None: """_require_excel_app returns the constructed app instance.""" fake_app = FakeApp(["Sheet1"], raise_on_open=False) - monkeypatch.setattr(render.xw, "App", lambda *a, **k: fake_app) + monkeypatch.setattr(xw, "App", lambda *a, **k: fake_app) assert render._require_excel_app() is fake_app @@ -152,7 +194,7 @@ def _raise(*args: object, **kwargs: object) -> None: _ = kwargs raise RuntimeError("boom") - monkeypatch.setattr(render.xw, "App", _raise) + monkeypatch.setattr(xw, "App", _raise) with pytest.raises(RenderError, match="Excel \\(COM\\) is not available"): render._require_excel_app() @@ -164,7 +206,7 @@ def test_export_pdf_success(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> xlsx.write_bytes(b"dummy") output_pdf = tmp_path / "out.pdf" sheet_names = ["Sheet1", "Summary"] - monkeypatch.setattr(render.xw, "App", _fake_app_factory(sheet_names)) + monkeypatch.setattr(xw, "App", _fake_app_factory(sheet_names)) result = render.export_pdf(xlsx, output_pdf) @@ -179,9 +221,7 @@ def test_export_pdf_wraps_failure( xlsx = tmp_path / "input.xlsx" xlsx.write_bytes(b"dummy") output_pdf = tmp_path / "out.pdf" - monkeypatch.setattr( - render.xw, "App", _fake_app_factory(["Sheet1"], raise_on_open=True) - ) + monkeypatch.setattr(xw, "App", _fake_app_factory(["Sheet1"], raise_on_open=True)) with pytest.raises(RenderError, match="Failed to export PDF for"): render.export_pdf(xlsx, output_pdf) @@ -194,9 +234,9 @@ def test_export_pdf_missing_output_raises( xlsx = tmp_path / "input.xlsx" xlsx.write_bytes(b"dummy") output_pdf = tmp_path / "out.pdf" - monkeypatch.setattr(render.xw, "App", _fake_app_factory(["Sheet1"])) + monkeypatch.setattr(xw, "App", _fake_app_factory(["Sheet1"])) - real_copy = render.shutil.copy + real_copy = shutil.copy def _copy( src: Path | str, dst: Path | str, *args: object, **kwargs: object @@ -207,7 +247,7 @@ def _copy( return Path(dst) return Path(real_copy(src, dst)) - monkeypatch.setattr(render.shutil, "copy", _copy) + monkeypatch.setattr(shutil, "copy", _copy) with pytest.raises(RenderError, match="Failed to export PDF to"): render.export_pdf(xlsx, output_pdf) @@ -241,39 +281,36 @@ def test_export_sheet_images_success( xlsx = tmp_path / "input.xlsx" xlsx.write_bytes(b"dummy") out_dir = tmp_path / "images" - - def _fake_export_pdf(excel_path: Path, output_pdf: Path) -> list[str]: - _ = excel_path - output_pdf.write_bytes(b"%PDF-1.4") - return ["Sheet/1", " "] + monkeypatch.setenv("EXSTRUCT_RENDER_SUBPROCESS", "0") fake_pdfium = SimpleNamespace(PdfDocument=FakePdfDocument) monkeypatch.setattr(render, "_require_pdfium", lambda: fake_pdfium) - monkeypatch.setattr(render, "export_pdf", _fake_export_pdf) + monkeypatch.setattr( + render, "_require_excel_app", lambda: FakeApp(["Sheet/1", " "], False) + ) written = render.export_sheet_images(xlsx, out_dir, dpi=144) assert written[0].name == "01_Sheet_1.png" - assert written[1].name == "02_sheet.png" + assert written[1].name == "01_Sheet_1_p02.png" + assert written[2].name == "02_sheet.png" assert all(path.exists() for path in written) def test_export_sheet_images_propagates_render_error( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - """export_sheet_images re-raises RenderError from export_pdf.""" + """export_sheet_images re-raises RenderError from _require_excel_app.""" xlsx = tmp_path / "input.xlsx" xlsx.write_bytes(b"dummy") out_dir = tmp_path / "images" - - def _fake_export_pdf(excel_path: Path, output_pdf: Path) -> list[str]: - _ = excel_path - _ = output_pdf - raise RenderError("boom") + monkeypatch.setenv("EXSTRUCT_RENDER_SUBPROCESS", "0") fake_pdfium = SimpleNamespace(PdfDocument=FakePdfDocument) monkeypatch.setattr(render, "_require_pdfium", lambda: fake_pdfium) - monkeypatch.setattr(render, "export_pdf", _fake_export_pdf) + monkeypatch.setattr( + render, "_require_excel_app", lambda: (_ for _ in ()).throw(RenderError("boom")) + ) with pytest.raises(RenderError, match="boom"): render.export_sheet_images(xlsx, out_dir) @@ -286,38 +323,218 @@ def test_export_sheet_images_wraps_unknown_error( xlsx = tmp_path / "input.xlsx" xlsx.write_bytes(b"dummy") out_dir = tmp_path / "images" + monkeypatch.setenv("EXSTRUCT_RENDER_SUBPROCESS", "0") - def _fake_export_pdf(excel_path: Path, output_pdf: Path) -> list[str]: - _ = excel_path - output_pdf.write_bytes(b"%PDF-1.4") - return ["Sheet1"] + fake_pdfium = SimpleNamespace(PdfDocument=ExplodingPdfDocument) + monkeypatch.setattr(render, "_require_pdfium", lambda: fake_pdfium) + monkeypatch.setattr( + render, "_require_excel_app", lambda: FakeApp(["Sheet1"], False) + ) - class ExplodingPdfDocument: - """PdfDocument stub that raises on enter.""" + with pytest.raises(RenderError, match="Failed to export sheet images"): + render.export_sheet_images(xlsx, out_dir) - def __init__(self, path: str) -> None: - _ = path - def __enter__(self) -> ExplodingPdfDocument: - raise RuntimeError("boom") +def test_export_sheet_images_uses_subprocess_when_enabled( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """export_sheet_images delegates to subprocess rendering when enabled.""" + xlsx = tmp_path / "input.xlsx" + xlsx.write_bytes(b"dummy") + out_dir = tmp_path / "images" - def __exit__( - self, - exc_type: type[BaseException] | None, - exc: BaseException | None, - tb: object | None, - ) -> bool | None: - _ = exc_type - _ = exc - _ = tb - return None + calls: list[tuple[Path, Path, int, str, int]] = [] - fake_pdfium = SimpleNamespace(PdfDocument=ExplodingPdfDocument) - monkeypatch.setattr(render, "_require_pdfium", lambda: fake_pdfium) - monkeypatch.setattr(render, "export_pdf", _fake_export_pdf) + def _fake_subprocess( + pdf_path: Path, + output_dir: Path, + sheet_index: int, + safe_name: str, + dpi: int, + ) -> list[Path]: + calls.append((pdf_path, output_dir, sheet_index, safe_name, dpi)) + return [output_dir / f"{sheet_index + 1:02d}_{safe_name}.png"] - with pytest.raises(RenderError, match="Failed to export sheet images"): - render.export_sheet_images(xlsx, out_dir) + monkeypatch.setenv("EXSTRUCT_RENDER_SUBPROCESS", "1") + monkeypatch.setattr( + render, "_require_excel_app", lambda: FakeApp(["SheetA", "SheetB"], False) + ) + monkeypatch.setattr(render, "_require_pdfium", lambda: SimpleNamespace()) + monkeypatch.setattr(render, "_render_pdf_pages_subprocess", _fake_subprocess) + + written = render.export_sheet_images(xlsx, out_dir, dpi=144) + + assert len(calls) == 2 + assert written[0].name == "01_SheetA.png" + assert written[1].name == "02_SheetB.png" + + +def test_use_render_subprocess_env_toggle(monkeypatch: pytest.MonkeyPatch) -> None: + """_use_render_subprocess respects the env toggle.""" + monkeypatch.setenv("EXSTRUCT_RENDER_SUBPROCESS", "1") + assert render._use_render_subprocess() is True + monkeypatch.setenv("EXSTRUCT_RENDER_SUBPROCESS", "0") + assert render._use_render_subprocess() is False + + +class FakeQueue: + """Stub queue for subprocess tests.""" + + def __init__(self) -> None: + self.payload: dict[str, list[str] | str] | None = None + + def put(self, payload: dict[str, list[str] | str]) -> None: + self.payload = payload + + def get(self, timeout: float | None = None) -> dict[str, list[str] | str]: + _ = timeout + if self.payload is None: + raise TimeoutError("timeout") + return self.payload + + def empty(self) -> bool: + return self.payload is None + + +class FakeProcess: + """Stub process for subprocess tests.""" + + def __init__( + self, + queue: FakeQueue, + exitcode: int, + payload: dict[str, list[str] | str] | None = None, + ) -> None: + self._queue = queue + self.exitcode = exitcode + if payload is not None: + self._queue.put(payload) + + def start(self) -> None: + if self._queue.payload is None: + self._queue.put({"paths": ["dummy"]}) + + def join(self) -> None: + return None + + +class FakeContext: + """Stub multiprocessing context for subprocess tests.""" + + def __init__(self, queue: FakeQueue, process: FakeProcess) -> None: + self._queue = queue + self._process = process + + def Queue(self) -> FakeQueue: + return self._queue + + def Process(self, target: object, args: tuple[object, ...]) -> FakeProcess: + _ = target + _ = args + return self._process + + +def test_render_pdf_pages_subprocess_success(tmp_path: Path) -> None: + """_render_pdf_pages_subprocess returns paths when worker succeeds.""" + queue = FakeQueue() + process = FakeProcess( + queue, + exitcode=0, + payload={"paths": [str(tmp_path / "images" / "01_Sheet1.png")]}, + ) + context = FakeContext(queue, process) + render_mp = cast(Any, render).mp + + def _get_context(_: str) -> FakeContext: + return context + + pdf_path = tmp_path / "sheet_01.pdf" + pdf_path.write_bytes(b"%PDF-1.4") + output_dir = tmp_path / "images" + + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr(render_mp, "get_context", _get_context) + result = render._render_pdf_pages_subprocess( + pdf_path, output_dir, 0, "Sheet1", 144 + ) + + assert result == [output_dir / "01_Sheet1.png"] + + +def test_render_pdf_pages_subprocess_error(tmp_path: Path) -> None: + """_render_pdf_pages_subprocess raises when worker reports error.""" + queue = FakeQueue() + process = FakeProcess(queue, exitcode=0, payload={"error": "boom"}) + context = FakeContext(queue, process) + render_mp = cast(Any, render).mp + + def _get_context(_: str) -> FakeContext: + return context + + pdf_path = tmp_path / "sheet_01.pdf" + pdf_path.write_bytes(b"%PDF-1.4") + output_dir = tmp_path / "images" + + with pytest.MonkeyPatch.context() as monkeypatch: + monkeypatch.setattr(render_mp, "get_context", _get_context) + with pytest.raises(RenderError, match="boom"): + render._render_pdf_pages_subprocess(pdf_path, output_dir, 0, "Sheet1", 144) + + +def test_get_subprocess_result_timeout() -> None: + """_get_subprocess_result returns an error payload on timeout.""" + queue = FakeQueue() + result = render._get_subprocess_result(cast(Any, queue)) + + error = cast(str, result["error"]) + assert error.startswith("subprocess did not return results") + + +def test_render_pdf_pages_worker_success(tmp_path: Path) -> None: + """_render_pdf_pages_worker writes images and returns paths.""" + pdf_path = tmp_path / "sheet_01.pdf" + pdf_path.write_bytes(b"%PDF-1.4") + output_dir = tmp_path / "images" + queue = FakeQueue() + fake_pdfium = cast(Any, ModuleType("pypdfium2")) + fake_pdfium.PdfDocument = FakePdfDocument + + sys.modules["pypdfium2"] = fake_pdfium + try: + render._render_pdf_pages_worker( + pdf_path, output_dir, 0, "Sheet1", 144, cast(Any, queue) + ) + finally: + sys.modules.pop("pypdfium2", None) + + assert queue.payload == { + "paths": [ + str(output_dir / "01_Sheet1.png"), + str(output_dir / "01_Sheet1_p02.png"), + ] + } + assert (output_dir / "01_Sheet1.png").exists() + assert (output_dir / "01_Sheet1_p02.png").exists() + + +def test_render_pdf_pages_worker_error(tmp_path: Path) -> None: + """_render_pdf_pages_worker reports errors via queue.""" + pdf_path = tmp_path / "sheet_01.pdf" + pdf_path.write_bytes(b"%PDF-1.4") + output_dir = tmp_path / "images" + queue = FakeQueue() + + fake_pdfium = cast(Any, ModuleType("pypdfium2")) + fake_pdfium.PdfDocument = ExplodingPdfDocument + sys.modules["pypdfium2"] = fake_pdfium + try: + render._render_pdf_pages_worker( + pdf_path, output_dir, 0, "Sheet1", 144, cast(Any, queue) + ) + finally: + sys.modules.pop("pypdfium2", None) + + assert queue.payload == {"error": "boom"} def test_sanitize_sheet_filename() -> None: diff --git a/uv.lock b/uv.lock index 9bfcbde..29c3683 100644 --- a/uv.lock +++ b/uv.lock @@ -298,7 +298,7 @@ wheels = [ [[package]] name = "exstruct" -version = "0.3.2" +version = "0.3.5" source = { editable = "." } dependencies = [ { name = "numpy" },