From 69de15267000879ae5f93f3b510cb3d8db347716 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 20:33:55 +0900 Subject: [PATCH 01/16] update --- .gitignore | 1 + uv.lock | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fe57ac3..db1a07e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[oc] build/ dist/ +drafts/ wheels/ *.egg-info diff --git a/uv.lock b/uv.lock index 4ee6c6d..9bfcbde 100644 --- a/uv.lock +++ b/uv.lock @@ -298,7 +298,7 @@ wheels = [ [[package]] name = "exstruct" -version = "0.3.1" +version = "0.3.2" source = { editable = "." } dependencies = [ { name = "numpy" }, From 0559412e4e9aeb41e341fb78550cbc72d7b3ff1b Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 20:38:16 +0900 Subject: [PATCH 02/16] =?UTF-8?q?=E4=BB=95=E6=A7=98=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/agents/FEATURE_SPEC.md | 7 +++---- docs/agents/ROADMAP.md | 8 ++++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/agents/FEATURE_SPEC.md b/docs/agents/FEATURE_SPEC.md index 32ae620..532332b 100644 --- a/docs/agents/FEATURE_SPEC.md +++ b/docs/agents/FEATURE_SPEC.md @@ -4,11 +4,10 @@ --- -## セル結合範囲取得機能の追加 +## セル結合範囲モデルのコンテキスト量圧縮 -- 新たに`MergedCell`モデルを作成、座標情報とセル値を持つ -- `SheetData`に`merged_ranges`フィールドを追加し、list[MergedRange]を持つ -- デフォルトモード以上(standard/verbose)で取得するようにする。engineには無効化オプションをつけておく(outputoptionsで出力時に削除する方向で) +- 現状のmerged_cellsがコンテキスト量を非常に多く持っているため、データ構造の見直しをしてコンテキスト量の圧縮を計る +- rowsとmerged_cellsでセル値を重複して持っているので、出力時にrowsにある結合セルの値を落とすようにする ## 今後のオプション(検討メモ) diff --git a/docs/agents/ROADMAP.md b/docs/agents/ROADMAP.md index 55aaefa..f2c7d9c 100644 --- a/docs/agents/ROADMAP.md +++ b/docs/agents/ROADMAP.md @@ -40,11 +40,15 @@ - セル結合範囲取得機能追加 -## v0.3.3 +## v0.4.0 + +- セル結合範囲データ圧縮とrowsデータ構造見直しによるコンテキスト圧縮 + +## v0.4.1 - 数式取得オプション追加 -## v0.4.0 +## v0.5.0 - Excel Form Controls 解析 From 44920dd9419a042ca2ccd80e40eb64c57a94f77b Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 20:43:15 +0900 Subject: [PATCH 03/16] =?UTF-8?q?agents.md=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AGENTS.md | 10 ++++++++++ docs/agents/CODE_REVIEW.md | 0 2 files changed, 10 insertions(+) delete mode 100644 docs/agents/CODE_REVIEW.md diff --git a/AGENTS.md b/AGENTS.md index 68582f5..6a7fb01 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -199,4 +199,14 @@ AI エージェントが ExStruct のコードを書く場合でも: --- +# 10. 各種仕様の確認 + +AI エージェントは必要に応じて以下のドキュメントを参照して ExStruct の開発をする + +- 処理アーキテクチャ: `docs/architecture/pipeline.md` +- プロジェクトアーキテクチャ: `docs/contributors/architecture.md` +- コーディングガイドライン: `docs/agents/CODING_GUIDELINES.md` +- データモデル: `docs/agents/DATA_MODEL.md` +- タスク: `docs/agents/TASKS.md` + **以上。AI はこのガイドラインに従って ExStruct の開発に参加してください。** diff --git a/docs/agents/CODE_REVIEW.md b/docs/agents/CODE_REVIEW.md deleted file mode 100644 index e69de29..0000000 From 8c40fcd9d423fc44cb45cc409e326d0d68e8edd2 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 20:54:23 +0900 Subject: [PATCH 04/16] =?UTF-8?q?spec,=20task=E5=AE=9A=E7=BE=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/agents/FEATURE_SPEC.md | 55 ++++++++++++++++++++++++++++++------- docs/agents/TASKS.md | 24 ++++++---------- 2 files changed, 54 insertions(+), 25 deletions(-) diff --git a/docs/agents/FEATURE_SPEC.md b/docs/agents/FEATURE_SPEC.md index 532332b..435001e 100644 --- a/docs/agents/FEATURE_SPEC.md +++ b/docs/agents/FEATURE_SPEC.md @@ -1,23 +1,58 @@ -# Feature Spec for AI Agent (Phase-by-Phase) +# Feature Spec for AI Agent (Phase-by-Phase) 本ドキュメントは AI エージェント向けに、段階的に実装を進めるための仕様メモです。 --- -## セル結合範囲モデルのコンテキスト量圧縮 +## セル結合データのコンテキスト量圧縮 -- 現状のmerged_cellsがコンテキスト量を非常に多く持っているため、データ構造の見直しをしてコンテキスト量の圧縮を計る -- rowsとmerged_cellsでセル値を重複して持っているので、出力時にrowsにある結合セルの値を落とすようにする +- 現状の `merged_cells` がコンテキスト量を非常に多く持っているため、データ構造の見直しで圧縮する +- `rows` と `merged_cells` でセル値を重複して持っているため、出力時に `rows` 側の結合セル値を落とす運用を検討する -## 今後のオプション(検討メモ) +### 仕様(v1.1 予定) -- 表検出スコアリングの閾値を CLI/環境変数で調整可能にする。 -- 出力モード(light/standard/verbose)に応じてテーブル候補数を制限するオプション。 +- `merged_cells` を **schema + items** 形式へ変更して冗長なキーを削減する +- 結合セルの値は `merged_cells` に集約し、`rows` 側に保持するかはフラグで切替可能にする + +#### merged_cells の新フォーマット(例) + +```json +{ + "merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [1, 0, 2, 1, "A1-B2 merged"], + [3, 4, 3, 6, "merged value"] + ] + } +} +``` + +- `r1/c1/r2/c2` は従来同様の座標(row: 1-based, col: 0-based) +- `v` は結合セルの代表値(セル値がない場合でも `" "` を出力する) + +#### rows 側の結合セル値の扱い + +- 新しいフラグ `include_merged_values_in_rows: bool` を導入 +- `True` の場合は互換モード(従来どおり `rows` に結合セル値を残す) +- `False` の場合は `rows` から結合セル値を排除し、`merged_cells` のみで値を保持 + +#### 互換性 + +- デフォルトは `True` として破壊的変更を回避 +- 将来的にデフォルト切替の可能性があるため、出力仕様に明記する + +--- + +## 今後のオプション検討メモ + +- 表検知スコアリングの閾値を CLI/環境変数で調整可能にする +- 出力モード(light/standard/verbose)に応じてテーブル候補数を制限するオプション --- ## 実装方針 -- 各ステップ完了ごとにテスト追加または既存フィクスチャで手動確認。 -- 短い定数・関数分割でスコアリングの調整をしやすくする。 -- 外部公開前なので、破壊的変更は適宜コメント・メモに残す。 +- 小さなステップごとにテスト追加、または既存フィクスチャで手動確認 +- 短い関数・責務分割でスコアリング調整をしやすくする +- 外部公開前なので、破壊的変更はコメントや仕様に明示して段階的に移行する diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index fbccecc..ffe5376 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -1,17 +1,11 @@ -# Task List +# Task List -## 結合セル(MergedCell)テスト +未完了 [ ], 完了 [x] -- [ ] フィクスチャ作成:結合セルあり/なし、複数範囲、値あり/空文字の Excel を用意 -- [ ] OpenpyxlBackend.extract_merged_cells の正常系(座標・代表値)をユニットテスト -- [ ] OpenpyxlBackend.extract_merged_cells の例外時フォールバック(空マップ)をテスト -- [ ] ComBackend.extract_merged_cells が NotImplementedError を送出することをテスト -- [ ] Pipeline: standard/verbose で merged_cells を含み、light では空になることをテスト -- [ ] Pipeline: include_merged_cells=False で抽出ステップが無効化されることをテスト -- [ ] Modeling: SheetRawData→SheetData で merged_cells が保持されることをテスト -- [ ] Engine: OutputOptions.filters.include_merged_cells=False で出力から除外されることをテスト -- [ ] Export: dict_without_empty_values により merged_cells 空リストが出力されないことをテスト - -## カバレッジ対応 - -- [ ] 追加テストで 78% 以上の全体カバレッジを満たすことを確認 \ No newline at end of file +- [ ] 仕様: `merged_cells` の新フォーマット(schema + items)をモデルと出力仕様に反映 +- [ ] 仕様: `include_merged_values_in_rows` フラグ追加(デフォルト True) +- [ ] 実装: 既存の `merged_cells` 生成ロジックを新構造へ置換 +- [ ] 実装: `rows` から結合セル値を排除する分岐を追加(フラグ制御) +- [ ] 実装: 結合セルの値がない場合は `" "` を出力 +- [ ] 更新: 既存の JSON 出力例・ドキュメントの整合性確認 +- [ ] テスト: 結合セルが多いケースの JSON 量削減を確認 From a8fbb3555a12db758d7270d2cd31da5779044df6 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 21:33:56 +0900 Subject: [PATCH 05/16] =?UTF-8?q?-=20merged=5Fcells=20=E3=81=AF=20MergedCe?= =?UTF-8?q?lls(schema+items)=20=E3=81=AB=E5=A4=89=E6=8F=9B=E3=81=97?= =?UTF-8?q?=E3=81=A6=E5=87=BA=E5=8A=9B=E3=80=81=E7=A9=BA=E5=80=A4=E3=81=AF?= =?UTF-8?q?=20"=20"=20=E3=81=AB=E6=AD=A3=E8=A6=8F=E5=8C=96=20-=20include?= =?UTF-8?q?=5Fmerged=5Fvalues=5Fin=5Frows=EF=BC=88=E6=97=A2=E5=AE=9A=20Tru?= =?UTF-8?q?e=EF=BC=89=E3=81=A7=20rows=20=E3=81=8B=E3=82=89=E7=B5=90?= =?UTF-8?q?=E5=90=88=E3=82=BB=E3=83=AB=E5=80=A4=E3=82=92=E9=99=A4=E5=A4=96?= =?UTF-8?q?=E5=8F=AF=E8=83=BD=E3=81=AB=20-=20include=5Fmerged=5Fcells=20?= =?UTF-8?q?=E3=82=92=20false=20=E3=81=AE=E5=A0=B4=E5=90=88=E3=81=AF=20merg?= =?UTF-8?q?ed=5Fcells=20=E3=82=92=20None=20=E5=87=BA=E5=8A=9B=E3=81=AB?= =?UTF-8?q?=E7=B5=B1=E4=B8=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/agents/DATA_MODEL.md | 121 +++++++++++++------------- docs/agents/ROADMAP.md | 4 +- src/exstruct/core/backends/base.py | 6 +- src/exstruct/core/cells.py | 23 +++-- src/exstruct/core/integrate.py | 3 + src/exstruct/core/modeling.py | 24 ++++- src/exstruct/core/pipeline.py | 135 +++++++++++++++++++++++++++-- src/exstruct/engine.py | 5 +- src/exstruct/io/__init__.py | 20 +++-- src/exstruct/models/__init__.py | 42 ++++++--- 10 files changed, 286 insertions(+), 97 deletions(-) diff --git a/docs/agents/DATA_MODEL.md b/docs/agents/DATA_MODEL.md index ff2284b..0d7d056 100644 --- a/docs/agents/DATA_MODEL.md +++ b/docs/agents/DATA_MODEL.md @@ -1,14 +1,16 @@ -# ExStruct データモデル仕様 +# ExStruct データモデル仕様 -**Version**: 0.14 -**Status**: Authoritative — 本ドキュメントは ExStruct が返す全モデルの唯一の正準ソースです。 -core / io / integrate は必ずこの仕様に従うこと。モデルは **pydantic v2** で実装します。 +**Version**: 0.15 +**Status**: Authoritative + +本ドキュメントは ExStruct が返す全モデルの唯一の正準ソースです。 +core / io / integrate はこの仕様に従うこと。モデルは **pydantic v2** で実装します。 --- # 1. Overview -ExStruct は Excel ワークブックを LLM が扱いやすい **意味構造(Semantic Structure)** として JSON 化します。 +ExStruct は Excel ワークブックを LLM が扱いやすい **意味構造** として JSON 化します。 特記がない限り、以下のモデルはすべて Pydantic の `BaseModel` です。 --- @@ -19,12 +21,12 @@ ExStruct は Excel ワークブックを LLM が扱いやすい **意味構造 ```jsonc BaseShape { - id: int | null // sheet 内の通番 id(線/矢印は null の場合あり) + id: int | null // sheet 連番 id(矢印は null の場合あり) text: str l: int // left (px) t: int // top (px) w: int | null // width (px) - h: int | null // height(px) + h: int | null // height (px) rotation: float | null } @@ -37,8 +39,8 @@ Arrow extends BaseShape { kind: "arrow" begin_arrow_style: int | null end_arrow_style: int | null - begin_id: int | null // コネクタ始点の接続先 Shape.id - end_id: int | null // コネクタ終点の接続先 Shape.id + begin_id: int | null // コネクタ始点の接続 Shape.id + end_id: int | null // コネクタ終点の接続 Shape.id direction: "E"|"SE"|"S"|"SW"|"W"|"NW"|"N"|"NE" | null } @@ -56,10 +58,10 @@ SmartArt extends BaseShape { 補足: -- `direction` は線や矢印の向きを 8 方位に正規化したもの。 -- 矢印スタイルは Excel の enum に対応。 -- `begin_id` / `end_id` は、コネクタが接続している図形の `id` に対応(`ConnectorFormat.BeginConnectedShape` / `EndConnectedShape`)。 -- `SmartArtNode` はネスト構造で表現し、`nodes` がツリーの根。 +- `direction` は線や矢印の向きを 8 方位に正規化 +- 矢印スタイルは Excel の enum に対応 +- `begin_id` / `end_id` はコネクタが接続している図形の `id` +- `SmartArtNode` はネスト構造で表現し、`nodes` がツリーの根 --- @@ -67,9 +69,9 @@ SmartArt extends BaseShape { ```jsonc CellRow { - r: int // 行番号(Excel 由来、1-based) - c: { [colIndex: str]: str | int | float } // 非空セルのみ、列インデックスは文字列 ("0","1",...) - links: { [colIndex: str]: url } | null // ハイパーリンク(有効化時のみ) + r: int // 行番号 (1-based) + c: { [colIndex: str]: str | int | float } // 非空セルのみ、キーは列インデックス文字列 + links: { [colIndex: str]: url } | null // ハイパーリンク有効化時のみ } ``` @@ -123,8 +125,8 @@ PrintArea { 補足: -- シートごとに複数保持可能。 -- `standard` / `verbose` で取得できる場合に含まれる。 +- シートごとに複数保持可能 +- `standard` / `verbose` で取得できる場合に含まれる --- @@ -144,25 +146,22 @@ PrintAreaView { 補足: -- 座標はデフォルトでシート基準。`normalize` 指定時は範囲左上を原点に再基準化。 +- 座標はデフォルトでシート基準。`normalize` 指定時は範囲左上を原点に再基準化 --- -# 8. MergedCell Model +# 8. MergedCells Model ```jsonc -MergedCell { - r1: int // 開始行 (1-based) - c1: int // 開始列 (0-based) - r2: int // 終了行 (1-based, inclusive) - c2: int // 終了列 (0-based, inclusive) - v: str // 結合セルの代表値 (空文字の可能性あり) +MergedCells { + schema: ["r1", "c1", "r2", "c2", "v"] + items: [[int, int, int, int, str]] } ``` -補足: - -- `standard` / `verbose` で取得される +- `items` は `(r1, c1, r2, c2, v)` の配列 +- row は 1-based、col は 0-based +- `v` は結合セルの代表値。値がない場合でも `" "` を出力する --- @@ -175,16 +174,18 @@ SheetData { charts: [Chart] table_candidates: [str] print_areas: [PrintArea] - auto_print_areas: [PrintArea] // 自動改ページ矩形(COM 前提、デフォルト無効) - colors_map: {[colorHex: str]: [[int, int]]} // (row=1-based, col=0-based) のセル座標を列挙 - merged_cells: [MergedCell] // ?????? (standard/verbose) + auto_print_areas: [PrintArea] // 自動改ページ矩形 (COM 前提、デフォルト無効) + colors_map: {[colorHex: str]: [[int, int]]} // (row=1-based, col=0-based) + merged_cells: MergedCells | null } ``` 補足: -- `table_candidates` はテーブル検出結果。 -- `print_areas` は定義済み印刷範囲。`auto_print_areas` は Excel COM の自動改ページから取得し、明示的に有効化した場合のみ含まれる。 +- `table_candidates` はテーブル検知結果 +- `print_areas` は定義済み印刷範囲 +- `auto_print_areas` は Excel COM の自動改ページから取得 +- `rows` の結合セル値の出力は `include_merged_values_in_rows` フラグで制御(既定: `True`) --- @@ -199,7 +200,7 @@ WorkbookData { 補足: -- シート名は Excel の Unicode 名をそのまま保持。 +- シート名は Excel の Unicode 名をそのまま保持 --- @@ -208,41 +209,45 @@ WorkbookData { 共通: - `to_json(pretty=False, indent=None)` -- `to_yaml()`(`pyyaml` 必須) -- `to_toon()`(`python-toon` 必須) -- `save(path, pretty=False, indent=None)` — 拡張子から `.json` / `.yaml` / `.yml` / `.toon` を自動判別。非対応拡張子は `ValueError`。 -- `model_dump(exclude_none=True)` 後に `dict_without_empty_values` で空値を除去。 +- `to_yaml()` (`pyyaml` 必須) +- `to_toon()` (`python-toon` 必須) +- `save(path, pretty=False, indent=None)` + - 拡張子 `.json` / `.yaml` / `.yml` / `.toon` を自動判別 + - 非対応拡張子は `ValueError` +- `model_dump(exclude_none=True)` 後に `dict_without_empty_values` で空値を除去 `SheetData`: -- シリアライズ時に `book_name` は含まない(シート単体)。 +- シリアライズ時に `book_name` は含まない(シート単体) `WorkbookData`: -- ペイロードに `book_name` と `sheets` を含む。 -- `__getitem__(sheet_name)` で SheetData を取得、`__iter__()` で `(sheet_name, SheetData)` を順序付きで返す。 +- ペイロードに `book_name` と `sheets` を含む +- `__getitem__(sheet_name)` で SheetData を取得 +- `__iter__()` で `(sheet_name, SheetData)` を順に返す --- -# 12. Versioning Principles(エージェント向け) +# 12. Versioning Principles -- モデル変更時は必ず本ファイルを先に更新する。 -- モデルは純粋なデータコンテナとし、副作用を持たせない。 -- core / io / integrate は本仕様に忠実なモデルのみを返し、独自フィールドを追加しない。 +- モデル変更時は本ファイルを先に更新する +- モデルは純粋なデータコンテナとし、副作用を持たせない +- core / io / integrate は本仕様に忠実なモデルのみを返し、独自フィールドを追加しない --- # 13. Changelog -- 0.3: serialize/save ヘルパーを追加、`WorkbookData` に `__iter__` / `__getitem__` を定義。 -- 0.4: `CellRow.links` を追加(ハイパーリンクは opt-in、verbose でデフォルト有効)。 -- 0.5: `PrintArea` を追加し、`SheetData.print_areas` で保持。standard / verbose で出力。 -- 0.6: PrintArea をデフォルト抽出。テーブル検出は従来通り。 -- 0.7: Chart にサイズフィールド `w` / `h`(optional)を追加。 -- 0.8: `SheetData.auto_print_areas` を追加(COM の自動改ページ矩形、デフォルト無効)。ヘルパーとデフォルト挙動を明確化。 -- 0.9: Shape に `name` / `begin_connected_shape` / `end_connected_shape` を追加し、コネクタの接続元/接続先を表現(後に `begin_id` / `end_id` に名称変更)。 -- 0.10: Shape に `id` を追加し、コネクタの接続元/接続先を `id` 参照に変更し、`name` をペイロードから除去。 -- 0.11: コネクタのフィールド名を `begin_id` / `end_id` にリネーム。 -- 0.12: SheetData に背景色情報を格納する`colors_map`を追加。 -- 0.13: Shape を `Shape` / `Arrow` / `SmartArt` に分離し、`SmartArtNode` のネスト構造を追加。 -- 0.14: セル結合範囲データを持つ`MergedCell`,`SheetData.merged_cells` を追加 +- 0.3: serialize/save ヘルパー追加、`WorkbookData` に `__iter__` / `__getitem__` を定義 +- 0.4: `CellRow.links` を追加(ハイパーリンクは opt-in) +- 0.5: `PrintArea` を追加し、`SheetData.print_areas` で保持 +- 0.6: PrintArea をデフォルト抽出。テーブル検知は従来通り +- 0.7: Chart にサイズ `w` / `h` を追加 +- 0.8: `SheetData.auto_print_areas` を追加(COM 自動改ページ矩形、デフォルト無効) +- 0.9: Shape に `name` / `begin_connected_shape` / `end_connected_shape` を追加し、後に `begin_id` / `end_id` に変更 +- 0.10: Shape に `id` を追加し、`name` を削除 +- 0.11: コネクタのフィールド名を `begin_id` / `end_id` に統一 +- 0.12: `SheetData.colors_map` を追加 +- 0.13: Shape を `Shape` / `Arrow` / `SmartArt` に分割し、`SmartArtNode` のネスト構造を追加 +- 0.14: `MergedCell` / `SheetData.merged_cells` を追加 +- 0.15: `MergedCells` を schema + items 形式に変更し圧縮形式を導入 diff --git a/docs/agents/ROADMAP.md b/docs/agents/ROADMAP.md index f2c7d9c..a69533c 100644 --- a/docs/agents/ROADMAP.md +++ b/docs/agents/ROADMAP.md @@ -40,11 +40,11 @@ - セル結合範囲取得機能追加 -## v0.4.0 +## v0.3.5 - セル結合範囲データ圧縮とrowsデータ構造見直しによるコンテキスト圧縮 -## v0.4.1 +## v0.3.6 - 数式取得オプション追加 diff --git a/src/exstruct/core/backends/base.py b/src/exstruct/core/backends/base.py index 27d8bdc..f678e54 100644 --- a/src/exstruct/core/backends/base.py +++ b/src/exstruct/core/backends/base.py @@ -3,12 +3,12 @@ from dataclasses import dataclass from typing import Protocol -from ...models import CellRow, MergedCell, PrintArea -from ..cells import WorkbookColorsMap +from ...models import CellRow, PrintArea +from ..cells import MergedCellRange, WorkbookColorsMap CellData = dict[str, list[CellRow]] PrintAreaData = dict[str, list[PrintArea]] -MergedCellData = dict[str, list[MergedCell]] +MergedCellData = dict[str, list[MergedCellRange]] @dataclass(frozen=True) diff --git a/src/exstruct/core/cells.py b/src/exstruct/core/cells.py index 5aaead9..de3b352 100644 --- a/src/exstruct/core/cells.py +++ b/src/exstruct/core/cells.py @@ -15,7 +15,7 @@ import pandas as pd import xlwings as xw -from ..models import CellRow, MergedCell +from ..models import CellRow from .workbook import openpyxl_workbook logger = logging.getLogger(__name__) @@ -67,6 +67,17 @@ def get_sheet(self, sheet_name: str) -> SheetColorsMap | None: return self.sheets.get(sheet_name) +@dataclass(frozen=True) +class MergedCellRange: + """Merged cell range with normalized value.""" + + r1: int + c1: int + r2: int + c2: int + v: str + + def extract_sheet_colors_map( file_path: Path, *, include_default_background: bool, ignore_colors: set[str] | None ) -> WorkbookColorsMap: @@ -526,7 +537,7 @@ def extract_sheet_cells_with_links(file_path: Path) -> dict[str, list[CellRow]]: return merged -def extract_sheet_merged_cells(file_path: Path) -> dict[str, list[MergedCell]]: +def extract_sheet_merged_cells(file_path: Path) -> dict[str, list[MergedCellRange]]: """Extract merged cell ranges per sheet via openpyxl. Args: @@ -535,21 +546,23 @@ def extract_sheet_merged_cells(file_path: Path) -> dict[str, list[MergedCell]]: Returns: Mapping of sheet name to merged cell ranges. """ - merged_by_sheet: dict[str, list[MergedCell]] = {} + merged_by_sheet: dict[str, list[MergedCellRange]] = {} with openpyxl_workbook(file_path, data_only=True, read_only=False) as wb: for ws in wb.worksheets: merged_ranges = getattr(ws, "merged_cells", None) if merged_ranges is None: merged_by_sheet[ws.title] = [] continue - results: list[MergedCell] = [] + results: list[MergedCellRange] = [] for merged_range in getattr(merged_ranges, "ranges", []): bounds = range_boundaries(str(merged_range)) min_col, min_row, max_col, max_row = bounds cell_value = ws.cell(row=min_row, column=min_col).value value_str = "" if cell_value is None else str(cell_value) + if value_str == "": + value_str = " " results.append( - MergedCell( + MergedCellRange( r1=min_row, c1=min_col - 1, r2=max_row, diff --git a/src/exstruct/core/integrate.py b/src/exstruct/core/integrate.py index 77815aa..03017ce 100644 --- a/src/exstruct/core/integrate.py +++ b/src/exstruct/core/integrate.py @@ -18,6 +18,7 @@ def extract_workbook( # noqa: C901 include_default_background: bool = False, ignore_colors: set[str] | None = None, include_merged_cells: bool | None = None, + include_merged_values_in_rows: bool = True, ) -> WorkbookData: """Extract workbook and return WorkbookData. @@ -33,6 +34,7 @@ def extract_workbook( # noqa: C901 include_default_background: Whether to include default background color. ignore_colors: Optional set of color keys to ignore. include_merged_cells: Whether to include merged cell ranges; None uses mode defaults. + include_merged_values_in_rows: Whether to keep merged values in rows. Returns: Extracted WorkbookData. @@ -50,6 +52,7 @@ def extract_workbook( # noqa: C901 include_default_background=include_default_background, ignore_colors=ignore_colors, include_merged_cells=include_merged_cells, + include_merged_values_in_rows=include_merged_values_in_rows, ) result = run_extraction_pipeline(inputs) return result.workbook diff --git a/src/exstruct/core/modeling.py b/src/exstruct/core/modeling.py index fb3e665..f92f32a 100644 --- a/src/exstruct/core/modeling.py +++ b/src/exstruct/core/modeling.py @@ -6,13 +6,14 @@ Arrow, CellRow, Chart, - MergedCell, + MergedCells, PrintArea, Shape, SheetData, SmartArt, WorkbookData, ) +from .cells import MergedCellRange @dataclass(frozen=True) @@ -37,7 +38,7 @@ class SheetRawData: print_areas: list[PrintArea] auto_print_areas: list[PrintArea] colors_map: dict[str, list[tuple[int, int]]] - merged_cells: list[MergedCell] + merged_cells: list[MergedCellRange] @dataclass(frozen=True) @@ -70,10 +71,27 @@ def build_sheet_data(raw: SheetRawData) -> SheetData: print_areas=raw.print_areas, auto_print_areas=raw.auto_print_areas, colors_map=raw.colors_map, - merged_cells=raw.merged_cells, + merged_cells=_build_merged_cells(raw.merged_cells), ) +def _build_merged_cells( + merged_cells: list[MergedCellRange], +) -> MergedCells | None: + """Build a compressed merged_cells model from raw ranges. + + Args: + merged_cells: Raw merged cell ranges. + + Returns: + MergedCells model or None when empty. + """ + if not merged_cells: + return None + items = [(cell.r1, cell.c1, cell.r2, cell.c2, cell.v) for cell in merged_cells] + return MergedCells(items=items) + + def build_workbook_data(raw: WorkbookRawData) -> WorkbookData: """Build a WorkbookData model from raw workbook data. diff --git a/src/exstruct/core/pipeline.py b/src/exstruct/core/pipeline.py index 5199b30..3a4657a 100644 --- a/src/exstruct/core/pipeline.py +++ b/src/exstruct/core/pipeline.py @@ -14,7 +14,6 @@ Arrow, CellRow, Chart, - MergedCell, PrintArea, Shape, SmartArt, @@ -22,7 +21,7 @@ ) from .backends.com_backend import ComBackend from .backends.openpyxl_backend import OpenpyxlBackend -from .cells import WorkbookColorsMap, detect_tables +from .cells import MergedCellRange, WorkbookColorsMap, detect_tables from .charts import get_charts from .logging_utils import log_fallback from .modeling import SheetRawData, WorkbookRawData, build_workbook_data @@ -32,7 +31,7 @@ ExtractionMode = Literal["light", "standard", "verbose"] CellData = dict[str, list[CellRow]] PrintAreaData = dict[str, list[PrintArea]] -MergedCellData = dict[str, list[MergedCell]] +MergedCellData = dict[str, list[MergedCellRange]] ShapeData = dict[str, list[Shape | Arrow | SmartArt]] ChartData = dict[str, list[Chart]] @@ -53,6 +52,7 @@ class ExtractionInputs: include_default_background: Whether to include default background color. ignore_colors: Optional set of color keys to ignore. include_merged_cells: Whether to include merged cell ranges. + include_merged_values_in_rows: Whether to keep merged values in rows. """ file_path: Path @@ -64,6 +64,7 @@ class ExtractionInputs: include_default_background: bool ignore_colors: set[str] | None include_merged_cells: bool + include_merged_values_in_rows: bool @dataclass @@ -179,6 +180,7 @@ def resolve_extraction_inputs( include_default_background: bool, ignore_colors: set[str] | None, include_merged_cells: bool | None, + include_merged_values_in_rows: bool, ) -> ExtractionInputs: """Resolve include flags and normalize inputs for the pipeline. @@ -192,6 +194,7 @@ def resolve_extraction_inputs( include_default_background: Include default background colors when colors_map is enabled. ignore_colors: Optional set of colors to ignore when colors_map is enabled. include_merged_cells: Whether to include merged cell ranges; None uses mode defaults. + include_merged_values_in_rows: Whether to keep merged values in rows. Returns: Resolved ExtractionInputs. @@ -233,6 +236,7 @@ def resolve_extraction_inputs( include_default_background=resolved_default_background, ignore_colors=resolved_ignore_colors, include_merged_cells=resolved_merged_cells, + include_merged_values_in_rows=include_merged_values_in_rows, ) @@ -583,6 +587,108 @@ def _resolve_sheet_colors_map( return sheet_colors.colors_map +def _filter_rows_excluding_merged_values( + rows: list[CellRow], + merged_cells: list[MergedCellRange], +) -> list[CellRow]: + """Remove merged-cell values from rows. + + Args: + rows: Extracted rows. + merged_cells: Merged cell ranges. + + Returns: + Filtered rows with merged-cell values removed. + """ + if not rows or not merged_cells: + return rows + intervals_by_row = _build_merged_row_intervals(merged_cells) + if not intervals_by_row: + return rows + filtered_rows: list[CellRow] = [] + for row in rows: + intervals = intervals_by_row.get(row.r) + if not intervals: + filtered_rows.append(row) + continue + filtered_cells: dict[str, int | float | str] = {} + for col_key, value in row.c.items(): + col_index = _safe_col_index(col_key) + if col_index is None: + filtered_cells[col_key] = value + continue + if not _col_in_intervals(col_index, intervals): + filtered_cells[col_key] = value + if not filtered_cells: + continue + filtered_links = None + if row.links: + filtered_links = { + col_key: link + for col_key, link in row.links.items() + if col_key in filtered_cells + } + if not filtered_links: + filtered_links = None + filtered_rows.append(CellRow(r=row.r, c=filtered_cells, links=filtered_links)) + return filtered_rows + + +def _build_merged_row_intervals( + merged_cells: list[MergedCellRange], +) -> dict[int, list[tuple[int, int]]]: + """Build row -> merged column intervals lookup. + + Args: + merged_cells: Merged cell ranges. + + Returns: + Mapping of row index to merged column intervals. + """ + intervals_by_row: dict[int, list[tuple[int, int]]] = {} + for cell in merged_cells: + for row in range(cell.r1, cell.r2 + 1): + intervals_by_row.setdefault(row, []).append((cell.c1, cell.c2)) + for row, intervals in intervals_by_row.items(): + intervals_by_row[row] = _merge_intervals(intervals) + return intervals_by_row + + +def _merge_intervals(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]: + """Merge overlapping or adjacent intervals.""" + if not intervals: + return [] + sorted_intervals = sorted(intervals) + merged: list[tuple[int, int]] = [] + current_start, current_end = sorted_intervals[0] + for start, end in sorted_intervals[1:]: + if start <= current_end + 1: + current_end = max(current_end, end) + continue + merged.append((current_start, current_end)) + current_start, current_end = start, end + merged.append((current_start, current_end)) + return merged + + +def _col_in_intervals(col_index: int, intervals: list[tuple[int, int]]) -> bool: + """Check whether a column index falls in any interval.""" + for start, end in intervals: + if col_index < start: + return False + if start <= col_index <= end: + return True + return False + + +def _safe_col_index(col_key: str) -> int | None: + """Parse a column key to int, returning None on failure.""" + try: + return int(col_key) + except ValueError: + return None + + def collect_sheet_raw_data( *, cell_data: CellData, @@ -591,6 +697,7 @@ def collect_sheet_raw_data( merged_cell_data: MergedCellData, workbook: xw.Book, mode: ExtractionMode = "standard", + include_merged_values_in_rows: bool, print_area_data: PrintAreaData | None = None, auto_page_break_data: PrintAreaData | None = None, colors_map_data: WorkbookColorsMap | None = None, @@ -607,6 +714,7 @@ def collect_sheet_raw_data( print_area_data: Optional print area data per sheet. auto_page_break_data: Optional auto page-break data per sheet. colors_map_data: Optional colors map data. + include_merged_values_in_rows: Whether to keep merged values in rows. Returns: Mapping of sheet name to raw sheet data. @@ -614,8 +722,14 @@ def collect_sheet_raw_data( result: dict[str, SheetRawData] = {} for sheet_name, rows in cell_data.items(): sheet = workbook.sheets[sheet_name] + merged_cells = merged_cell_data.get(sheet_name, []) + filtered_rows = ( + rows + if include_merged_values_in_rows + else _filter_rows_excluding_merged_values(rows, merged_cells) + ) sheet_raw = SheetRawData( - rows=rows, + rows=filtered_rows, shapes=shape_data.get(sheet_name, []), charts=chart_data.get(sheet_name, []) if mode != "light" else [], table_candidates=detect_tables(sheet), @@ -624,7 +738,7 @@ def collect_sheet_raw_data( if auto_page_break_data else [], colors_map=_resolve_sheet_colors_map(colors_map_data, sheet_name), - merged_cells=merged_cell_data.get(sheet_name, []), + merged_cells=merged_cells, ) result[sheet_name] = sheet_raw return result @@ -674,6 +788,7 @@ def _fallback(message: str, reason: FallbackReason) -> PipelineResult: merged_cell_data=artifacts.merged_cell_data, workbook=workbook, mode=inputs.mode, + include_merged_values_in_rows=inputs.include_merged_values_in_rows, print_area_data=artifacts.print_area_data if inputs.include_print_areas else None, @@ -733,8 +848,14 @@ def build_cells_tables_workbook( colors_map_data.get_sheet(sheet_name) if colors_map_data else None ) tables = backend.detect_tables(sheet_name) + merged_cells = artifacts.merged_cell_data.get(sheet_name, []) + filtered_rows = ( + rows + if inputs.include_merged_values_in_rows + else _filter_rows_excluding_merged_values(rows, merged_cells) + ) sheets[sheet_name] = SheetRawData( - rows=rows, + rows=filtered_rows, shapes=[], charts=[], table_candidates=tables, @@ -743,7 +864,7 @@ def build_cells_tables_workbook( else [], auto_print_areas=[], colors_map=sheet_colors.colors_map if sheet_colors else {}, - merged_cells=artifacts.merged_cell_data.get(sheet_name, []), + merged_cells=merged_cells, ) raw = WorkbookRawData(book_name=inputs.file_path.name, sheets=sheets) return build_workbook_data(raw) diff --git a/src/exstruct/engine.py b/src/exstruct/engine.py index 86a2e11..58a51e6 100644 --- a/src/exstruct/engine.py +++ b/src/exstruct/engine.py @@ -71,6 +71,7 @@ class StructOptions: per engine instance without touching global state. include_colors_map: Whether to extract background color maps. include_merged_cells: Whether to extract merged cell ranges. + include_merged_values_in_rows: Whether to keep merged values in rows. colors: Color extraction options. """ @@ -81,6 +82,7 @@ class StructOptions: include_cell_links: bool | None = None # None -> auto: verbose=True, others=False include_colors_map: bool | None = None # None -> auto: verbose=True, others=False include_merged_cells: bool | None = None # None -> auto: light=False, others=True + include_merged_values_in_rows: bool = True colors: ColorsOptions = field(default_factory=ColorsOptions) @@ -286,7 +288,7 @@ def _filter_sheet( auto_print_areas=sheet.auto_print_areas if include_auto_print_areas else [], merged_cells=sheet.merged_cells if self.output.filters.include_merged_cells - else [], + else None, ) def _filter_workbook( @@ -357,6 +359,7 @@ def extract( include_default_background=self.options.colors.include_default_background, ignore_colors=self.options.colors.ignore_colors_set(), include_merged_cells=self.options.include_merged_cells, + include_merged_values_in_rows=self.options.include_merged_values_in_rows, ) def serialize( diff --git a/src/exstruct/io/__init__.py b/src/exstruct/io/__init__.py index 4e7c83d..11a8a44 100644 --- a/src/exstruct/io/__init__.py +++ b/src/exstruct/io/__init__.py @@ -62,7 +62,9 @@ def dict_without_empty_values(obj: object) -> JsonStructure: | Arrow | SmartArt, ): - return dict_without_empty_values(obj.model_dump(exclude_none=True)) + return dict_without_empty_values( + obj.model_dump(exclude_none=True, by_alias=True) + ) return cast(JsonStructure, obj) @@ -369,7 +371,9 @@ def save_print_area_views( f"_area{idx + 1}_r{area.r1}-{area.r2}_c{area.c1}-{area.c2}{suffix}" ) path = output_dir / file_name - payload = dict_without_empty_values(view.model_dump(exclude_none=True)) + payload = dict_without_empty_values( + view.model_dump(exclude_none=True, by_alias=True) + ) text = _serialize_payload_from_hint( payload, format_hint, pretty=pretty, indent=indent ) @@ -428,7 +432,9 @@ def save_auto_page_break_views( f"_auto_page{idx + 1}_r{area.r1}-{area.r2}_c{area.c1}-{area.c2}{suffix}" ) path = output_dir / file_name - payload = dict_without_empty_values(view.model_dump(exclude_none=True)) + payload = dict_without_empty_values( + view.model_dump(exclude_none=True, by_alias=True) + ) text = _serialize_payload_from_hint( payload, format_hint, pretty=pretty, indent=indent ) @@ -453,7 +459,9 @@ def serialize_workbook( error_type=SerializationError, error_message="Unsupported export format '{fmt}'. Allowed: json, yaml, yml, toon.", ) - filtered_dict = dict_without_empty_values(model.model_dump(exclude_none=True)) + filtered_dict = dict_without_empty_values( + model.model_dump(exclude_none=True, by_alias=True) + ) return _serialize_payload_from_hint( filtered_dict, format_hint, pretty=pretty, indent=indent ) @@ -478,7 +486,7 @@ def save_sheets_as_json( { "book_name": workbook.book_name, "sheet_name": sheet_name, - "sheet": sheet_data.model_dump(exclude_none=True), + "sheet": sheet_data.model_dump(exclude_none=True, by_alias=True), } ) file_name = f"{_sanitize_sheet_filename(sheet_name)}.json" @@ -517,7 +525,7 @@ def save_sheets( { "book_name": workbook.book_name, "sheet_name": sheet_name, - "sheet": sheet_data.model_dump(exclude_none=True), + "sheet": sheet_data.model_dump(exclude_none=True, by_alias=True), } ) suffix = {"json": ".json", "yaml": ".yaml", "toon": ".toon"}[format_hint] diff --git a/src/exstruct/models/__init__.py b/src/exstruct/models/__init__.py index 9578ea9..4041aeb 100644 --- a/src/exstruct/models/__init__.py +++ b/src/exstruct/models/__init__.py @@ -5,7 +5,12 @@ from pathlib import Path from typing import Literal -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field + + +def _default_merged_cells_schema() -> list[Literal["r1", "c1", "r2", "c2", "v"]]: + """Return default schema for merged cell items.""" + return ["r1", "c1", "r2", "c2", "v"] class BaseShape(BaseModel): @@ -76,14 +81,23 @@ class SmartArt(BaseShape): ) -class MergedCell(BaseModel): - """Metadata for a merged cell range.""" +class MergedCells(BaseModel): + """Compressed merged cell ranges using schema + items.""" - r1: int = Field(description="Start row (1-based).") - c1: int = Field(description="Start column (0-based).") - r2: int = Field(description="End row (1-based, inclusive).") - c2: int = Field(description="End column (0-based, inclusive).") - v: str = "" + model_config = ConfigDict(populate_by_name=True) + + schema_: list[Literal["r1", "c1", "r2", "c2", "v"]] = Field( + default_factory=_default_merged_cells_schema, + alias="schema", + description="Ordered field names for each item.", + ) + items: list[tuple[int, int, int, int, str]] = Field( + default_factory=list, + description=( + "Merged cell items as (r1, c1, r2, c2, v) tuples where rows are 1-based " + "and columns are 0-based." + ), + ) class CellRow(BaseModel): @@ -170,14 +184,16 @@ class SheetData(BaseModel): "where row is 1-based and column is 0-based." ), ) - merged_cells: list[MergedCell] = Field( - default_factory=list, description="Merged cell ranges on the sheet." + merged_cells: MergedCells | None = Field( + default=None, description="Merged cell ranges on the sheet." ) def _as_payload(self) -> dict[str, object]: from ..io import dict_without_empty_values - return dict_without_empty_values(self.model_dump(exclude_none=True)) # type: ignore + return dict_without_empty_values( + self.model_dump(exclude_none=True, by_alias=True) + ) # type: ignore def to_json(self, *, pretty: bool = False, indent: int | None = None) -> str: """ @@ -325,7 +341,9 @@ class PrintAreaView(BaseModel): def _as_payload(self) -> dict[str, object]: from ..io import dict_without_empty_values - return dict_without_empty_values(self.model_dump(exclude_none=True)) # type: ignore + return dict_without_empty_values( + self.model_dump(exclude_none=True, by_alias=True) + ) # type: ignore def to_json(self, *, pretty: bool = False, indent: int | None = None) -> str: """ From 61a8af1e30670f7b9f6d3e261d9c3896342d7972 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 21:51:19 +0900 Subject: [PATCH 06/16] =?UTF-8?q?=E3=83=86=E3=82=B9=E3=83=88=E3=82=B1?= =?UTF-8?q?=E3=83=BC=E3=82=B9=E3=81=AE=E8=BF=BD=E5=8A=A0=E3=81=A8=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/agents/TASKS.md | 12 +++---- tests/backends/test_auto_page_breaks.py | 1 + tests/backends/test_merged_cells.py | 6 ++-- tests/core/test_merged_cells_core.py | 2 +- tests/core/test_pipeline.py | 37 +++++++++++++++++++- tests/core/test_pipeline_fallbacks.py | 3 ++ tests/engine/test_engine.py | 5 +-- tests/export/test_export_requirements.py | 2 +- tests/integration/test_integrate_raw_data.py | 2 ++ tests/models/test_modeling.py | 4 ++- tests/models/test_models_export.py | 19 +++++++++- 11 files changed, 77 insertions(+), 16 deletions(-) diff --git a/docs/agents/TASKS.md b/docs/agents/TASKS.md index ffe5376..4a643d7 100644 --- a/docs/agents/TASKS.md +++ b/docs/agents/TASKS.md @@ -2,10 +2,10 @@ 未完了 [ ], 完了 [x] -- [ ] 仕様: `merged_cells` の新フォーマット(schema + items)をモデルと出力仕様に反映 -- [ ] 仕様: `include_merged_values_in_rows` フラグ追加(デフォルト True) -- [ ] 実装: 既存の `merged_cells` 生成ロジックを新構造へ置換 -- [ ] 実装: `rows` から結合セル値を排除する分岐を追加(フラグ制御) -- [ ] 実装: 結合セルの値がない場合は `" "` を出力 +- [x] 仕様: `merged_cells` の新フォーマット(schema + items)をモデルと出力仕様に反映 +- [x] 仕様: `include_merged_values_in_rows` フラグ追加(デフォルト True) +- [x] 実装: 既存の `merged_cells` 生成ロジックを新構造へ置換 +- [x] 実装: `rows` から結合セル値を排除する分岐を追加(フラグ制御) +- [x] 実装: 結合セルの値がない場合は `" "` を出力 - [ ] 更新: 既存の JSON 出力例・ドキュメントの整合性確認 -- [ ] テスト: 結合セルが多いケースの JSON 量削減を確認 +- [x] テスト: 結合セルが多いケースの JSON 量削減を確認 diff --git a/tests/backends/test_auto_page_breaks.py b/tests/backends/test_auto_page_breaks.py index 6c3adda..81e01d6 100644 --- a/tests/backends/test_auto_page_breaks.py +++ b/tests/backends/test_auto_page_breaks.py @@ -28,6 +28,7 @@ def fake_extract( include_default_background: bool = False, ignore_colors: set[str] | None = None, include_merged_cells: bool | None = None, + include_merged_values_in_rows: bool = True, ) -> WorkbookData: called["include_auto_page_breaks"] = include_auto_page_breaks return WorkbookData(book_name=path.name, sheets={}) diff --git a/tests/backends/test_merged_cells.py b/tests/backends/test_merged_cells.py index fbafca4..2683b81 100644 --- a/tests/backends/test_merged_cells.py +++ b/tests/backends/test_merged_cells.py @@ -8,7 +8,7 @@ from exstruct.core.backends.com_backend import ComBackend from exstruct.core.backends.openpyxl_backend import OpenpyxlBackend -from exstruct.models import MergedCell +from exstruct.core.cells import MergedCellRange def _make_merged_book(path: Path) -> None: @@ -32,13 +32,13 @@ def test_openpyxl_backend_extract_merged_cells(tmp_path: Path) -> None: backend = OpenpyxlBackend(path) merged = backend.extract_merged_cells() - assert merged["Sheet1"] == [MergedCell(r1=1, c1=0, r2=1, c2=2, v="Header")] + assert merged["Sheet1"] == [MergedCellRange(r1=1, c1=0, r2=1, c2=2, v="Header")] def test_openpyxl_backend_extract_merged_cells_handles_failure( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: - def _boom(_path: Path) -> dict[str, list[MergedCell]]: + def _boom(_path: Path) -> dict[str, list[MergedCellRange]]: raise RuntimeError("boom") monkeypatch.setattr( diff --git a/tests/core/test_merged_cells_core.py b/tests/core/test_merged_cells_core.py index 4129bd6..60377ac 100644 --- a/tests/core/test_merged_cells_core.py +++ b/tests/core/test_merged_cells_core.py @@ -24,7 +24,7 @@ def test_extract_sheet_merged_cells_basic(tmp_path: Path) -> None: assert len(ranges) == 2 tuples = {(r.r1, r.c1, r.r2, r.c2, r.v) for r in ranges} assert (1, 0, 2, 1, "Title") in tuples - assert (4, 3, 4, 4, "") in tuples + assert (4, 3, 4, 4, " ") in tuples def test_extract_sheet_merged_cells_empty(tmp_path: Path) -> None: diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index e45c5b1..24f821b 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -10,6 +10,7 @@ build_pre_com_pipeline, resolve_extraction_inputs, ) +from exstruct.core.cells import MergedCellRange from exstruct.models import CellRow, PrintArea @@ -26,6 +27,7 @@ def test_build_pre_com_pipeline_respects_flags( include_default_background=False, ignore_colors=None, include_merged_cells=False, + include_merged_values_in_rows=True, ) steps = build_pre_com_pipeline(inputs) step_names = [step.__name__ for step in steps] @@ -45,6 +47,7 @@ def test_build_pre_com_pipeline_includes_colors_map_for_light( include_default_background=False, ignore_colors=None, include_merged_cells=True, + include_merged_values_in_rows=True, ) steps = build_pre_com_pipeline(inputs) step_names = [step.__name__ for step in steps] @@ -69,6 +72,7 @@ def test_build_pre_com_pipeline_skips_merged_cells_when_disabled( include_default_background=False, ignore_colors=None, include_merged_cells=False, + include_merged_values_in_rows=True, ) steps = build_pre_com_pipeline(inputs) step_names = [step.__name__ for step in steps] @@ -86,6 +90,7 @@ def test_build_com_pipeline_respects_flags(tmp_path: Path) -> None: include_default_background=False, ignore_colors=None, include_merged_cells=False, + include_merged_values_in_rows=True, ) steps = build_com_pipeline(inputs) step_names = [step.__name__ for step in steps] @@ -109,6 +114,7 @@ def test_build_com_pipeline_excludes_auto_page_breaks_when_disabled( include_default_background=False, ignore_colors=None, include_merged_cells=False, + include_merged_values_in_rows=True, ) steps = build_com_pipeline(inputs) step_names = [step.__name__ for step in steps] @@ -126,6 +132,7 @@ def test_build_com_pipeline_empty_for_light(tmp_path: Path) -> None: include_default_background=False, ignore_colors=None, include_merged_cells=False, + include_merged_values_in_rows=True, ) steps = build_com_pipeline(inputs) assert steps == [] @@ -142,6 +149,7 @@ def test_resolve_extraction_inputs_defaults(tmp_path: Path) -> None: include_default_background=True, ignore_colors=None, include_merged_cells=None, + include_merged_values_in_rows=True, ) assert inputs.include_cell_links is False assert inputs.include_print_areas is True @@ -171,6 +179,7 @@ def fake_detect_tables(_: Path, __: str) -> list[str]: include_default_background=False, ignore_colors=None, include_merged_cells=True, + include_merged_values_in_rows=True, ) artifacts = ExtractionArtifacts( cell_data={"Sheet1": [CellRow(r=1, c={"0": "v"})]}, @@ -185,4 +194,30 @@ def fake_detect_tables(_: Path, __: str) -> list[str]: sheet = wb.sheets["Sheet1"] assert sheet.print_areas assert sheet.table_candidates == ["A1:B2"] - assert sheet.merged_cells == [] + assert sheet.merged_cells is None + + +def test_build_cells_tables_workbook_excludes_merged_values_in_rows( + tmp_path: Path, +) -> None: + inputs = ExtractionInputs( + file_path=tmp_path / "book.xlsx", + mode="standard", + include_cell_links=False, + include_print_areas=False, + include_auto_page_breaks=False, + include_colors_map=False, + include_default_background=False, + ignore_colors=None, + include_merged_cells=True, + include_merged_values_in_rows=False, + ) + artifacts = ExtractionArtifacts( + cell_data={"Sheet1": [CellRow(r=1, c={"0": "A", "1": "B", "2": "C"})]}, + merged_cell_data={ + "Sheet1": [MergedCellRange(r1=1, c1=0, r2=1, c2=1, v="A")] + }, + ) + wb = build_cells_tables_workbook(inputs=inputs, artifacts=artifacts, reason="test") + sheet = wb.sheets["Sheet1"] + assert sheet.rows[0].c == {"2": "C"} diff --git a/tests/core/test_pipeline_fallbacks.py b/tests/core/test_pipeline_fallbacks.py index 4eea4c7..393be5e 100644 --- a/tests/core/test_pipeline_fallbacks.py +++ b/tests/core/test_pipeline_fallbacks.py @@ -35,6 +35,7 @@ def test_pipeline_fallback_skip_com_tests( include_default_background=False, ignore_colors=None, include_merged_cells=None, + include_merged_values_in_rows=True, ) result = run_extraction_pipeline(inputs) @@ -68,6 +69,7 @@ def _raise(*_args: object, **_kwargs: object) -> None: include_default_background=False, ignore_colors=None, include_merged_cells=None, + include_merged_values_in_rows=True, ) result = run_extraction_pipeline(inputs) @@ -109,6 +111,7 @@ def _raise( include_default_background=False, ignore_colors=None, include_merged_cells=None, + include_merged_values_in_rows=True, ) result = run_extraction_pipeline(inputs) diff --git a/tests/engine/test_engine.py b/tests/engine/test_engine.py index d3c66c7..9ff36ec 100644 --- a/tests/engine/test_engine.py +++ b/tests/engine/test_engine.py @@ -14,7 +14,7 @@ CellRow, Chart, ChartSeries, - MergedCell, + MergedCells, PrintArea, Shape, SheetData, @@ -35,6 +35,7 @@ def fake_extract( include_default_background: bool = False, ignore_colors: set[str] | None = None, include_merged_cells: bool | None = None, + include_merged_values_in_rows: bool = True, ) -> WorkbookData: called["mode"] = mode called["include_print_areas"] = include_print_areas @@ -65,7 +66,7 @@ def _sample_workbook() -> WorkbookData: charts=[chart], table_candidates=["A1:B2"], print_areas=[PrintArea(r1=1, c1=0, r2=3, c2=2)], - merged_cells=[MergedCell(r1=1, c1=0, r2=1, c2=1, v="merged")], + merged_cells=MergedCells(items=[(1, 0, 1, 1, "merged")]), ) return WorkbookData(book_name="book.xlsx", sheets={"Sheet1": sheet}) diff --git a/tests/export/test_export_requirements.py b/tests/export/test_export_requirements.py index dae98d4..8bc6386 100644 --- a/tests/export/test_export_requirements.py +++ b/tests/export/test_export_requirements.py @@ -115,6 +115,6 @@ def test_export_sheets_yamlとtoonが出力される(tmp_path: Path) -> None: def test_merged_cells_empty_is_omitted_in_sheet_json() -> None: - sheet = SheetData(rows=[CellRow(r=1, c={"0": "v"})], merged_cells=[]) + sheet = SheetData(rows=[CellRow(r=1, c={"0": "v"})], merged_cells=None) payload = sheet.to_json() assert "merged_cells" not in payload diff --git a/tests/integration/test_integrate_raw_data.py b/tests/integration/test_integrate_raw_data.py index 68be78c..d94507c 100644 --- a/tests/integration/test_integrate_raw_data.py +++ b/tests/integration/test_integrate_raw_data.py @@ -59,6 +59,7 @@ def test_collect_sheet_raw_data_includes_extracted_fields( merged_cell_data={"Sheet1": []}, workbook=workbook, mode="standard", + include_merged_values_in_rows=True, print_area_data={"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]}, auto_page_break_data={"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]}, colors_map_data=colors_map, @@ -94,6 +95,7 @@ def test_collect_sheet_raw_data_skips_charts_in_light_mode( merged_cell_data={"Sheet1": []}, workbook=workbook, mode="light", + include_merged_values_in_rows=True, print_area_data=None, auto_page_break_data=None, colors_map_data=None, diff --git a/tests/models/test_modeling.py b/tests/models/test_modeling.py index 60a274a..c367c6a 100644 --- a/tests/models/test_modeling.py +++ b/tests/models/test_modeling.py @@ -1,3 +1,4 @@ +from exstruct.core.cells import MergedCellRange from exstruct.core.modeling import SheetRawData, WorkbookRawData, build_workbook_data from exstruct.models import CellRow, Chart, ChartSeries, PrintArea, Shape @@ -25,7 +26,7 @@ def test_build_workbook_data_from_raw() -> None: print_areas=[PrintArea(r1=1, c1=0, r2=1, c2=0)], auto_print_areas=[], colors_map={"#FFFFFF": [(1, 0)]}, - merged_cells=[], + merged_cells=[MergedCellRange(r1=1, c1=0, r2=1, c2=0, v=" ")], ) raw_workbook = WorkbookRawData(book_name="book.xlsx", sheets={"Sheet1": raw_sheet}) @@ -38,3 +39,4 @@ def test_build_workbook_data_from_raw() -> None: assert sheet.shapes assert sheet.charts assert sheet.print_areas + assert sheet.merged_cells is not None diff --git a/tests/models/test_models_export.py b/tests/models/test_models_export.py index 4d96008..22a66a4 100644 --- a/tests/models/test_models_export.py +++ b/tests/models/test_models_export.py @@ -5,7 +5,14 @@ import pytest from exstruct.errors import MissingDependencyError -from exstruct.models import CellRow, SheetData, SmartArt, SmartArtNode, WorkbookData +from exstruct.models import ( + CellRow, + MergedCells, + SheetData, + SmartArt, + SmartArtNode, + WorkbookData, +) HAS_PYYAML = util.find_spec("yaml") is not None HAS_TOON = util.find_spec("toon") is not None @@ -129,3 +136,13 @@ def test_sheet_json_includes_smartart_nodes() -> None: assert data["shapes"][0]["kind"] == "smartart" assert data["shapes"][0]["nodes"][0]["text"] == "root" assert data["shapes"][0]["nodes"][0]["kids"][0]["text"] == "child" + + +def test_sheet_json_includes_merged_cells_schema() -> None: + sheet = SheetData( + rows=[], + merged_cells=MergedCells(items=[(1, 0, 1, 1, "merged")]), + ) + data = json.loads(sheet.to_json()) + assert data["merged_cells"]["schema"] == ["r1", "c1", "r2", "c2", "v"] + assert data["merged_cells"]["items"][0] == [1, 0, 1, 1, "merged"] From af53737f73df0d25a405041f580726824e85bea1 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 21:51:39 +0900 Subject: [PATCH 07/16] fix --- tests/core/test_pipeline.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 24f821b..22c91b3 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -2,6 +2,7 @@ from _pytest.monkeypatch import MonkeyPatch +from exstruct.core.cells import MergedCellRange from exstruct.core.pipeline import ( ExtractionArtifacts, ExtractionInputs, @@ -10,7 +11,6 @@ build_pre_com_pipeline, resolve_extraction_inputs, ) -from exstruct.core.cells import MergedCellRange from exstruct.models import CellRow, PrintArea @@ -214,9 +214,7 @@ def test_build_cells_tables_workbook_excludes_merged_values_in_rows( ) artifacts = ExtractionArtifacts( cell_data={"Sheet1": [CellRow(r=1, c={"0": "A", "1": "B", "2": "C"})]}, - merged_cell_data={ - "Sheet1": [MergedCellRange(r1=1, c1=0, r2=1, c2=1, v="A")] - }, + merged_cell_data={"Sheet1": [MergedCellRange(r1=1, c1=0, r2=1, c2=1, v="A")]}, ) wb = build_cells_tables_workbook(inputs=inputs, artifacts=artifacts, reason="test") sheet = wb.sheets["Sheet1"] From 60c6db7009e1d3c8adc983cf6ce4af5c703eda6e Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 21:54:00 +0900 Subject: [PATCH 08/16] =?UTF-8?q?json=E3=82=B9=E3=82=AD=E3=83=BC=E3=83=9E?= =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- schemas/merged_cells.json | 50 +++++++++++++++++++++++++++++++ schemas/sheet.json | 61 ++++++++++++++++++++++++++++++++++++++ schemas/workbook.json | 61 ++++++++++++++++++++++++++++++++++++++ scripts/gen_json_schema.py | 2 ++ 4 files changed, 174 insertions(+) create mode 100644 schemas/merged_cells.json diff --git a/schemas/merged_cells.json b/schemas/merged_cells.json new file mode 100644 index 0000000..8e35601 --- /dev/null +++ b/schemas/merged_cells.json @@ -0,0 +1,50 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "Compressed merged cell ranges using schema + items.", + "properties": { + "items": { + "description": "Merged cell items as (r1, c1, r2, c2, v) tuples where rows are 1-based and columns are 0-based.", + "items": { + "maxItems": 5, + "minItems": 5, + "prefixItems": [ + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "string" + } + ], + "type": "array" + }, + "title": "Items", + "type": "array" + }, + "schema": { + "description": "Ordered field names for each item.", + "items": { + "enum": [ + "r1", + "c1", + "r2", + "c2", + "v" + ], + "type": "string" + }, + "title": "Schema", + "type": "array" + } + }, + "title": "MergedCells", + "type": "object" +} \ No newline at end of file diff --git a/schemas/sheet.json b/schemas/sheet.json index 9ea5497..0e39edd 100644 --- a/schemas/sheet.json +++ b/schemas/sheet.json @@ -373,6 +373,55 @@ "title": "ChartSeries", "type": "object" }, + "MergedCells": { + "description": "Compressed merged cell ranges using schema + items.", + "properties": { + "items": { + "description": "Merged cell items as (r1, c1, r2, c2, v) tuples where rows are 1-based and columns are 0-based.", + "items": { + "maxItems": 5, + "minItems": 5, + "prefixItems": [ + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "string" + } + ], + "type": "array" + }, + "title": "Items", + "type": "array" + }, + "schema": { + "description": "Ordered field names for each item.", + "items": { + "enum": [ + "r1", + "c1", + "r2", + "c2", + "v" + ], + "type": "string" + }, + "title": "Schema", + "type": "array" + } + }, + "title": "MergedCells", + "type": "object" + }, "PrintArea": { "description": "Cell coordinate bounds for a print area.", "properties": { @@ -669,6 +718,18 @@ "title": "Colors Map", "type": "object" }, + "merged_cells": { + "anyOf": [ + { + "$ref": "#/$defs/MergedCells" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Merged cell ranges on the sheet." + }, "print_areas": { "description": "User-defined print areas.", "items": { diff --git a/schemas/workbook.json b/schemas/workbook.json index 12ab273..576ef67 100644 --- a/schemas/workbook.json +++ b/schemas/workbook.json @@ -373,6 +373,55 @@ "title": "ChartSeries", "type": "object" }, + "MergedCells": { + "description": "Compressed merged cell ranges using schema + items.", + "properties": { + "items": { + "description": "Merged cell items as (r1, c1, r2, c2, v) tuples where rows are 1-based and columns are 0-based.", + "items": { + "maxItems": 5, + "minItems": 5, + "prefixItems": [ + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "integer" + }, + { + "type": "string" + } + ], + "type": "array" + }, + "title": "Items", + "type": "array" + }, + "schema": { + "description": "Ordered field names for each item.", + "items": { + "enum": [ + "r1", + "c1", + "r2", + "c2", + "v" + ], + "type": "string" + }, + "title": "Schema", + "type": "array" + } + }, + "title": "MergedCells", + "type": "object" + }, "PrintArea": { "description": "Cell coordinate bounds for a print area.", "properties": { @@ -545,6 +594,18 @@ "title": "Colors Map", "type": "object" }, + "merged_cells": { + "anyOf": [ + { + "$ref": "#/$defs/MergedCells" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Merged cell ranges on the sheet." + }, "print_areas": { "description": "User-defined print areas.", "items": { diff --git a/scripts/gen_json_schema.py b/scripts/gen_json_schema.py index a67bd3a..e0837d5 100644 --- a/scripts/gen_json_schema.py +++ b/scripts/gen_json_schema.py @@ -10,6 +10,7 @@ CellRow, Chart, ChartSeries, + MergedCells, PrintArea, PrintAreaView, Shape, @@ -59,6 +60,7 @@ def main() -> int: "smartart_node": SmartArtNode, "chart": Chart, "chart_series": ChartSeries, + "merged_cells": MergedCells, "print_area": PrintArea, "print_area_view": PrintAreaView, } From 6b53857df1ba66ab9371a4db259fffa5bc7fe779 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 22:01:00 +0900 Subject: [PATCH 09/16] =?UTF-8?q?sample=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/__init__.py | 1 + .../forms_with_many_merged_cells/__init__.py | 1 + .../en_form_sf425/__init__.py | 1 + .../en_form_sf425/example.py | 9 + .../en_form_sf425/sample.json | 295 +++++--------- .../ja_general_form/__init__.py | 1 + .../ja_general_form/example.py | 9 + .../ja_general_form/ja_form.json | 372 ++++++------------ 8 files changed, 228 insertions(+), 461 deletions(-) create mode 100644 sample/__init__.py create mode 100644 sample/forms_with_many_merged_cells/__init__.py create mode 100644 sample/forms_with_many_merged_cells/en_form_sf425/__init__.py create mode 100644 sample/forms_with_many_merged_cells/en_form_sf425/example.py create mode 100644 sample/forms_with_many_merged_cells/ja_general_form/__init__.py create mode 100644 sample/forms_with_many_merged_cells/ja_general_form/example.py diff --git a/sample/__init__.py b/sample/__init__.py new file mode 100644 index 0000000..89e52a2 --- /dev/null +++ b/sample/__init__.py @@ -0,0 +1 @@ +"""Sample package marker.""" diff --git a/sample/forms_with_many_merged_cells/__init__.py b/sample/forms_with_many_merged_cells/__init__.py new file mode 100644 index 0000000..92174a7 --- /dev/null +++ b/sample/forms_with_many_merged_cells/__init__.py @@ -0,0 +1 @@ +"""Sample forms package marker.""" diff --git a/sample/forms_with_many_merged_cells/en_form_sf425/__init__.py b/sample/forms_with_many_merged_cells/en_form_sf425/__init__.py new file mode 100644 index 0000000..28aa311 --- /dev/null +++ b/sample/forms_with_many_merged_cells/en_form_sf425/__init__.py @@ -0,0 +1 @@ +"""Sample SF425 form package marker.""" diff --git a/sample/forms_with_many_merged_cells/en_form_sf425/example.py b/sample/forms_with_many_merged_cells/en_form_sf425/example.py new file mode 100644 index 0000000..f3762ce --- /dev/null +++ b/sample/forms_with_many_merged_cells/en_form_sf425/example.py @@ -0,0 +1,9 @@ +from exstruct import ExStructEngine, StructOptions + +file_path = "sample.xlsx" + +engine = ExStructEngine( + options=StructOptions(include_merged_values_in_rows=False), +) +wb = engine.extract(file_path, mode="standard") +engine.export(wb, "output.json", pretty=False) diff --git a/sample/forms_with_many_merged_cells/en_form_sf425/sample.json b/sample/forms_with_many_merged_cells/en_form_sf425/sample.json index bb22a4f..0e2ef96 100644 --- a/sample/forms_with_many_merged_cells/en_form_sf425/sample.json +++ b/sample/forms_with_many_merged_cells/en_form_sf425/sample.json @@ -1,16 +1,13 @@ { - "book_name": "en_sf425.xlsx", + "book_name": "sample.xlsx", "sheets": { "FFR": { "rows": [ - { "r": 1, "c": { "0": "FEDERAL FINANCIAL REPORT" } }, - { "r": 3, "c": { "0": "(Follow form instructions)" } }, { "r": 4, "c": { "0": "1. Federal Agency and Organizational Element", "5": "2. Federal Grant or Other Identifying Number Assigned by Federal Agency", - "20": "Page", "23": " of" } }, @@ -21,14 +18,7 @@ "5": " (To report multiple grants, use FFR Attachment)" } }, - { - "r": 7, - "c": { - "0": "United States Environmental Protection Agency", - "21": 1, - "23": "pages" - } - }, + { "r": 7, "c": { "21": 1, "23": "pages" } }, { "r": 8, "c": { @@ -37,41 +27,13 @@ }, { "r": 13, - "c": { - "0": "4a. DUNS Number ", - "2": "4b. EIN", - "5": "5. Recipient Account Number or Identifying Number", - "12": "6. Report Type", - "18": "7. Basis of Accounting" - } + "c": { "5": "5. Recipient Account Number or Identifying Number" } }, { "r": 14, - "c": { - "5": " (To report multiple grants, use FFR Attachment)", - "12": "□ Quarterly \n□ Semi-Annual \n□ Annual \n□ Final", - "18": "□ Cash □ Accrual" - } - }, - { - "r": 16, - "c": { - "0": "8. Project/Grant Period", - "10": "9. Reporting Period End Date" - } - }, - { - "r": 17, - "c": { - "0": " From: (Month, Day, Year)", - "5": "To: (Month, Day, Year)", - "10": "(Month, Day, Year)" - } - }, - { - "r": 19, - "c": { "0": "10. Transactions ", "15": " Cumulative " } + "c": { "5": " (To report multiple grants, use FFR Attachment)" } }, + { "r": 19, "c": { "0": "10. Transactions " } }, { "r": 20, "c": { "0": "(Use lines a-c for single or multiple grant reporting)" } @@ -94,23 +56,16 @@ { "r": 28, "c": { "0": " e. Federal share of expenditures " } }, { "r": 29, - "c": { - "0": " f. Federal share of unliquidated obligations", - "15": 0 - } + "c": { "0": " f. Federal share of unliquidated obligations" } }, { "r": 30, - "c": { - "0": " g. Total Federal share (sum of lines e and f)", - "15": 0 - } + "c": { "0": " g. Total Federal share (sum of lines e and f)" } }, { "r": 31, "c": { - "0": " h. Unobligated balance of Federal funds (line d minus g)", - "15": 0 + "0": " h. Unobligated balance of Federal funds (line d minus g)" } }, { "r": 32, "c": { "0": " Recipient Share:" } }, @@ -119,8 +74,7 @@ { "r": 35, "c": { - "0": " k. Remaining recipient share to be provided (line i minus j)", - "15": 0 + "0": " k. Remaining recipient share to be provided (line i minus j)" } }, { "r": 36, "c": { "0": " Program Income:" } }, @@ -149,7 +103,6 @@ { "r": 41, "c": { - "0": "11. Indirect Expense", "1": " a. Type ", "3": "b. Rate", "5": "c. Period From", @@ -159,7 +112,7 @@ "18": "f. Federal Share" } }, - { "r": 44, "c": { "6": " g. Totals:", "7": 0, "10": 0, "18": 0 } }, + { "r": 44, "c": { "6": " g. Totals:" } }, { "r": 45, "c": { @@ -194,16 +147,9 @@ } }, { "r": 55, "c": { "10": "14. Agency use only:" } }, - { "r": 56, "c": { "10": "Federal Share Calculation" } }, { "r": 58, "c": { "12": "Standard Form 425" } }, { "r": 59, "c": { "12": "OMB Approval Number: 0348-0061 " } }, - { "r": 60, "c": { "12": "Expiration Date: 02/28/2015" } }, - { - "r": 61, - "c": { - "0": "Paperwork Burden Statement\nAccording to the Paperwork Reduction Act, as amended, no persons are required to respond to a collection of information unless it displays a valid OMB Control Number. The valid OMB control number for this information collection is 0348-0061. Public reporting burden for this collection of information is estimated to average 1.5 hours per response, including time for reviewing instructions, searching existing data sources, gathering and maintaining the data needed, and completing and reviewing the collection of information. Send comments regarding the burden estimate or any other aspect of this collection of information, including suggestions for reducing this burden, to the Office of Management and Budget, Paperwork Reduction Project ( 0348-0060), Washington, DC 20503." - } - } + { "r": 60, "c": { "12": "Expiration Date: 02/28/2015" } } ], "shapes": [ { @@ -216,144 +162,87 @@ } ], "print_areas": [{ "r1": 1, "c1": 0, "r2": 66, "c2": 23 }], - "merged_cells": [ - { "r1": 34, "c1": 15, "r2": 34, "c2": 23 }, - { - "r1": 56, - "c1": 10, - "r2": 57, - "c2": 17, - "v": "Federal Share Calculation" - }, - { "r1": 18, "c1": 10, "r2": 18, "c2": 23 }, - { "r1": 15, "c1": 0, "r2": 15, "c2": 1 }, - { - "r1": 16, - "c1": 10, - "r2": 16, - "c2": 23, - "v": "9. Reporting Period End Date" - }, - { "r1": 13, "c1": 2, "r2": 14, "c2": 4, "v": "4b. EIN" }, - { "r1": 27, "c1": 15, "r2": 27, "c2": 23 }, - { "r1": 13, "c1": 12, "r2": 13, "c2": 17, "v": "6. Report Type" }, - { - "r1": 13, - "c1": 18, - "r2": 13, - "c2": 23, - "v": "7. Basis of Accounting" - }, - { "r1": 46, "c1": 0, "r2": 46, "c2": 23 }, - { "r1": 39, "c1": 15, "r2": 39, "c2": 23 }, - { "r1": 23, "c1": 15, "r2": 23, "c2": 23 }, - { "r1": 7, "c1": 5, "r2": 7, "c2": 19 }, - { "r1": 43, "c1": 1, "r2": 43, "c2": 2 }, - { - "r1": 3, - "c1": 0, - "r2": 3, - "c2": 23, - "v": "(Follow form instructions)" - }, - { "r1": 54, "c1": 10, "r2": 54, "c2": 23 }, - { - "r1": 13, - "c1": 0, - "r2": 14, - "c2": 1, - "v": "4a. DUNS Number " - }, - { "r1": 12, "c1": 0, "r2": 12, "c2": 23 }, - { - "r1": 16, - "c1": 0, - "r2": 16, - "c2": 4, - "v": "8. Project/Grant Period" - }, - { - "r1": 7, - "c1": 0, - "r2": 7, - "c2": 4, - "v": "United States Environmental Protection Agency" - }, - { "r1": 28, "c1": 15, "r2": 28, "c2": 23 }, - { "r1": 42, "c1": 1, "r2": 42, "c2": 2 }, - { "r1": 19, "c1": 15, "r2": 19, "c2": 23, "v": " Cumulative " }, - { "r1": 37, "c1": 15, "r2": 37, "c2": 23 }, - { "r1": 18, "c1": 5, "r2": 18, "c2": 9 }, - { "r1": 50, "c1": 10, "r2": 50, "c2": 23 }, - { "r1": 44, "c1": 10, "r2": 44, "c2": 17, "v": "0" }, - { - "r1": 1, - "c1": 0, - "r2": 2, - "c2": 23, - "v": "FEDERAL FINANCIAL REPORT" - }, - { "r1": 55, "c1": 19, "r2": 57, "c2": 23, "v": " " }, - { "r1": 14, "c1": 18, "r2": 15, "c2": 23, "v": "□ Cash □ Accrual" }, - { "r1": 18, "c1": 0, "r2": 18, "c2": 4 }, - { - "r1": 17, - "c1": 5, - "r2": 17, - "c2": 9, - "v": "To: (Month, Day, Year)" - }, - { "r1": 30, "c1": 15, "r2": 30, "c2": 23, "v": "0" }, - { - "r1": 61, - "c1": 0, - "r2": 64, - "c2": 23, - "v": "Paperwork Burden Statement\nAccording to the Paperwork Reduction Act, as amended, no persons are required to respond to a collection of information unless it displays a valid OMB Control Number. The valid OMB control number for this information collection is 0348-0061. Public reporting burden for this collection of information is estimated to average 1.5 hours per response, including time for reviewing instructions, searching existing data sources, gathering and maintaining the data needed, and completing and reviewing the collection of information. Send comments regarding the burden estimate or any other aspect of this collection of information, including suggestions for reducing this burden, to the Office of Management and Budget, Paperwork Reduction Project ( 0348-0060), Washington, DC 20503." - }, - { "r1": 42, "c1": 7, "r2": 42, "c2": 9 }, - { "r1": 43, "c1": 10, "r2": 43, "c2": 17 }, - { "r1": 24, "c1": 15, "r2": 24, "c2": 23 }, - { "r1": 43, "c1": 3, "r2": 43, "c2": 4 }, - { "r1": 52, "c1": 10, "r2": 52, "c2": 23 }, - { "r1": 33, "c1": 15, "r2": 33, "c2": 23 }, - { "r1": 10, "c1": 0, "r2": 10, "c2": 23 }, - { "r1": 4, "c1": 20, "r2": 4, "c2": 22, "v": "Page" }, - { "r1": 44, "c1": 18, "r2": 44, "c2": 23, "v": "0" }, - { "r1": 41, "c1": 0, "r2": 43, "c2": 0, "v": "11. Indirect Expense" }, - { "r1": 42, "c1": 10, "r2": 42, "c2": 17 }, - { "r1": 15, "c1": 5, "r2": 15, "c2": 11 }, - { "r1": 15, "c1": 2, "r2": 15, "c2": 4 }, - { - "r1": 17, - "c1": 0, - "r2": 17, - "c2": 4, - "v": " From: (Month, Day, Year)" - }, - { "r1": 9, "c1": 0, "r2": 9, "c2": 23 }, - { "r1": 43, "c1": 7, "r2": 43, "c2": 9 }, - { "r1": 50, "c1": 0, "r2": 52, "c2": 9 }, - { "r1": 43, "c1": 18, "r2": 43, "c2": 23 }, - { "r1": 29, "c1": 15, "r2": 29, "c2": 23, "v": "0" }, - { "r1": 35, "c1": 15, "r2": 35, "c2": 23, "v": "0" }, - { "r1": 38, "c1": 15, "r2": 38, "c2": 23 }, - { "r1": 42, "c1": 3, "r2": 42, "c2": 4 }, - { "r1": 17, "c1": 10, "r2": 17, "c2": 23, "v": "(Month, Day, Year)" }, - { "r1": 44, "c1": 7, "r2": 44, "c2": 9, "v": "0" }, - { "r1": 42, "c1": 18, "r2": 42, "c2": 23 }, - { "r1": 11, "c1": 0, "r2": 11, "c2": 23 }, - { "r1": 31, "c1": 15, "r2": 31, "c2": 23, "v": "0" }, - { - "r1": 14, - "c1": 12, - "r2": 15, - "c2": 17, - "v": "□ Quarterly \n□ Semi-Annual \n□ Annual \n□ Final" - }, - { "r1": 22, "c1": 15, "r2": 22, "c2": 23 }, - { "r1": 40, "c1": 15, "r2": 40, "c2": 23 } - ] + "merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [34, 15, 34, 23, " "], + [56, 10, 57, 17, "Federal Share Calculation"], + [18, 10, 18, 23, " "], + [15, 0, 15, 1, " "], + [16, 10, 16, 23, "9. Reporting Period End Date"], + [13, 2, 14, 4, "4b. EIN"], + [27, 15, 27, 23, " "], + [13, 12, 13, 17, "6. Report Type"], + [13, 18, 13, 23, "7. Basis of Accounting"], + [46, 0, 46, 23, " "], + [39, 15, 39, 23, " "], + [23, 15, 23, 23, " "], + [7, 5, 7, 19, " "], + [43, 1, 43, 2, " "], + [3, 0, 3, 23, "(Follow form instructions)"], + [54, 10, 54, 23, " "], + [13, 0, 14, 1, "4a. DUNS Number "], + [12, 0, 12, 23, " "], + [16, 0, 16, 4, "8. Project/Grant Period"], + [7, 0, 7, 4, "United States Environmental Protection Agency"], + [28, 15, 28, 23, " "], + [42, 1, 42, 2, " "], + [19, 15, 19, 23, " Cumulative "], + [37, 15, 37, 23, " "], + [18, 5, 18, 9, " "], + [50, 10, 50, 23, " "], + [44, 10, 44, 17, "0"], + [1, 0, 2, 23, "FEDERAL FINANCIAL REPORT"], + [55, 19, 57, 23, " "], + [14, 18, 15, 23, "□ Cash □ Accrual"], + [18, 0, 18, 4, " "], + [17, 5, 17, 9, "To: (Month, Day, Year)"], + [30, 15, 30, 23, "0"], + [ + 61, + 0, + 64, + 23, + "Paperwork Burden Statement\nAccording to the Paperwork Reduction Act, as amended, no persons are required to respond to a collection of information unless it displays a valid OMB Control Number. The valid OMB control number for this information collection is 0348-0061. Public reporting burden for this collection of information is estimated to average 1.5 hours per response, including time for reviewing instructions, searching existing data sources, gathering and maintaining the data needed, and completing and reviewing the collection of information. Send comments regarding the burden estimate or any other aspect of this collection of information, including suggestions for reducing this burden, to the Office of Management and Budget, Paperwork Reduction Project ( 0348-0060), Washington, DC 20503." + ], + [42, 7, 42, 9, " "], + [43, 10, 43, 17, " "], + [24, 15, 24, 23, " "], + [43, 3, 43, 4, " "], + [52, 10, 52, 23, " "], + [33, 15, 33, 23, " "], + [10, 0, 10, 23, " "], + [4, 20, 4, 22, "Page"], + [44, 18, 44, 23, "0"], + [41, 0, 43, 0, "11. Indirect Expense"], + [42, 10, 42, 17, " "], + [15, 5, 15, 11, " "], + [15, 2, 15, 4, " "], + [17, 0, 17, 4, " From: (Month, Day, Year)"], + [9, 0, 9, 23, " "], + [43, 7, 43, 9, " "], + [50, 0, 52, 9, " "], + [43, 18, 43, 23, " "], + [29, 15, 29, 23, "0"], + [35, 15, 35, 23, "0"], + [38, 15, 38, 23, " "], + [42, 3, 42, 4, " "], + [17, 10, 17, 23, "(Month, Day, Year)"], + [44, 7, 44, 9, "0"], + [42, 18, 42, 23, " "], + [11, 0, 11, 23, " "], + [31, 15, 31, 23, "0"], + [ + 14, + 12, + 15, + 17, + "□ Quarterly \n□ Semi-Annual \n□ Annual \n□ Final" + ], + [22, 15, 22, 23, " "], + [40, 15, 40, 23, " "] + ] + } } } } diff --git a/sample/forms_with_many_merged_cells/ja_general_form/__init__.py b/sample/forms_with_many_merged_cells/ja_general_form/__init__.py new file mode 100644 index 0000000..819582c --- /dev/null +++ b/sample/forms_with_many_merged_cells/ja_general_form/__init__.py @@ -0,0 +1 @@ +"""Sample Japanese general form package marker.""" diff --git a/sample/forms_with_many_merged_cells/ja_general_form/example.py b/sample/forms_with_many_merged_cells/ja_general_form/example.py new file mode 100644 index 0000000..c10f23c --- /dev/null +++ b/sample/forms_with_many_merged_cells/ja_general_form/example.py @@ -0,0 +1,9 @@ +from exstruct import ExStructEngine, StructOptions + +file_path = "ja_form.xlsx" + +engine = ExStructEngine( + options=StructOptions(include_merged_values_in_rows=False), +) +wb = engine.extract(file_path, mode="standard") +engine.export(wb, "output.json", pretty=False) diff --git a/sample/forms_with_many_merged_cells/ja_general_form/ja_form.json b/sample/forms_with_many_merged_cells/ja_general_form/ja_form.json index e759226..176f89a 100644 --- a/sample/forms_with_many_merged_cells/ja_general_form/ja_form.json +++ b/sample/forms_with_many_merged_cells/ja_general_form/ja_form.json @@ -3,11 +3,6 @@ "sheets": { "Sheet1": { "rows": [ - { "r": 1, "c": { "0": "介護保険負担限度額認定申請書" } }, - { - "r": 3, - "c": { "0": "(申請先)", "7": "     年    月    日" } - }, { "r": 4, "c": { "1": "X市長 " } }, { "r": 5, @@ -15,60 +10,19 @@ "0": "次のとおり関係書類を添えて、食費・居住費(滞在費)に係る負担限度額認定を申請します。" } }, - { "r": 6, "c": { "0": "フ リ ガ ナ", "7": " 被保険者番号" } }, - { "r": 7, "c": { "0": "被保険者氏名" } }, { "r": 8, "c": { "7": "個人番号" } }, - { - "r": 9, - "c": { - "0": "生年月日", - "2": "明・大・昭      年     月     日" - } - }, - { "r": 10, "c": { "0": "住   所" } }, { "r": 12, "c": { "7": "連絡先" } }, - { - "r": 13, - "c": { - "0": "入所(院)し       た介護保険施        設の所在地及     び名称(※)" - } - }, { "r": 14, "c": { "7": "連絡先" } }, - { - "r": 17, - "c": { - "0": "入所(院)", - "2": "     年   月   日", - "5": "(※)介護保険施設に入所(院)していない場合及び                        ショートスティを利用している場合は、記入不要です。" - } - }, - { "r": 18, "c": { "0": "年月日(*)" } }, - { - "r": 20, - "c": { - "0": "配偶者の有無", - "2": "有     ・     無", - "5": "左記において「無」の場合は、以下の「配偶者に関する事項」につい                   ては、記入不要です。" - } - }, - { "r": 22, "c": { "0": "配偶者に関する事項", "1": "フ リ ガ ナ" } }, - { "r": 23, "c": { "1": "氏  名" } }, + { "r": 22, "c": { "1": "フ リ ガ ナ" } }, { "r": 25, "c": { "1": "生年月日", - "2": "  明  ・  大  ・  昭       年       月       日", - "8": "個人番号" - } - }, - { "r": 26, "c": { "1": "住  所", "2": "〒", "7": "連絡先" } }, - { - "r": 29, - "c": { - "1": "本年1月1日現在の住所   (現住所と     異なる場合)", - "2": "〒" + "2": "  明  ・  大  ・  昭       年       月       日" } }, + { "r": 26, "c": { "2": "〒", "7": "連絡先" } }, + { "r": 29, "c": { "2": "〒" } }, { "r": 33, "c": { @@ -77,18 +31,7 @@ "4": "  課税       ・       非課税" } }, - { - "r": 35, - "c": { - "0": "収入等に関    する申告", - "2": "□", - "3": "①生活保護受給者/②市町村民税世帯非課税である老齢福祉年金受給者" - } - }, - { - "r": 37, - "c": { "2": "□", "3": "③市町村民税世帯非課税者であって、" } - }, + { "r": 37, "c": { "3": "③市町村民税世帯非課税者であって、" } }, { "r": 38, "c": { @@ -107,10 +50,7 @@ "3": "  ※ 寡婦年金、かん夫年金、母子年金、準母子年金、遺児年金を含みます。以下同じ。" } }, - { - "r": 41, - "c": { "2": "□", "3": "④市町村民税世帯非課税者であって、" } - }, + { "r": 41, "c": { "3": "④市町村民税世帯非課税者であって、" } }, { "r": 42, "c": { @@ -118,10 +58,7 @@ } }, { "r": 43, "c": { "3": "年額80万円を超え、120万円以下です。" } }, - { - "r": 44, - "c": { "2": "□", "3": "⑤市町村民税世帯非課税者であって、" } - }, + { "r": 44, "c": { "3": "⑤市町村民税世帯非課税者であって、" } }, { "r": 45, "c": { @@ -132,8 +69,6 @@ { "r": 47, "c": { - "0": "預貯金等に     関する申告", - "2": "□", "3": "預貯金、有価証券等の金額の合計が②の方は1000万円(夫婦は2000万円)、" } }, @@ -153,31 +88,9 @@ "3": "※第2号被保険者(40歳以上64歳以下)の場合、③~⑤の方は1000万円(夫婦は2000万円)以下です。" } }, - { - "r": 51, - "c": { - "2": "預貯金額", - "4": "円", - "5": "有価証券", - "7": "円", - "8": "その他", - "10": "(        )※" - } - }, - { - "r": 52, - "c": { "8": "(現金・負債      を含む)", "10": "円" } - }, - { - "r": 53, - "c": { "5": "(評価概算額)", "10": "※内容を記入してください" } - }, - { - "r": 55, - "c": { - "5": "申請者が被保険者本人の場合には、下記について記載は不要です。" - } - }, + { "r": 51, "c": { "10": "(        )※" } }, + { "r": 52, "c": { "10": "円" } }, + { "r": 53, "c": { "10": "※内容を記入してください" } }, { "r": 56, "c": { "0": " 申請者氏名", "8": " 連絡先(自宅・勤務先)" } @@ -191,13 +104,7 @@ "1": " この申請書における「配偶者」については、世帯分離している配偶者又は内縁関係の者を含みます。" } }, - { - "r": 65, - "c": { - "0": -2, - "1": " 預貯金等に関しては、同じ種類の預貯金等を複数所有している場合は、そのすべてを記入し、通帳等の写し                  を添付してください。" - } - }, + { "r": 65, "c": { "0": -2 } }, { "r": 67, "c": { @@ -205,162 +112,111 @@ "1": " 書き切れない場合は、余白に記入するか又は別紙に記入の上添付してください。" } }, - { - "r": 68, - "c": { - "0": -4, - "1": " 虚偽の申告により不正に特定入所者介護サービス費等の支給を受けた場合には、介護保険法第22条第1項              の規定に基づき、支給された額及び最大2倍の加算金を返還していただくことがあります。" - } - } + { "r": 68, "c": { "0": -4 } } ], "table_candidates": ["B25:C26", "C37:D50"], - "merged_cells": [ - { - "r1": 55, - "c1": 5, - "r2": 55, - "c2": 10, - "v": "申請者が被保険者本人の場合には、下記について記載は不要です。" - }, - { "r1": 54, "c1": 8, "r2": 54, "c2": 10 }, - { "r1": 51, "c1": 5, "r2": 52, "c2": 6, "v": "有価証券" }, - { - "r1": 52, - "c1": 8, - "r2": 53, - "c2": 9, - "v": "(現金・負債      を含む)" - }, - { - "r1": 47, - "c1": 0, - "r2": 53, - "c2": 1, - "v": "預貯金等に     関する申告" - }, - { "r1": 10, "c1": 0, "r2": 12, "c2": 1, "v": "住   所" }, - { "r1": 10, "c1": 2, "r2": 10, "c2": 10 }, - { "r1": 7, "c1": 3, "r2": 8, "c2": 4 }, - { - "r1": 3, - "c1": 7, - "r2": 3, - "c2": 10, - "v": "     年    月    日" - }, - { "r1": 6, "c1": 7, "r2": 7, "c2": 7, "v": " 被保険者番号" }, - { "r1": 7, "c1": 6, "r2": 8, "c2": 6 }, - { "r1": 22, "c1": 0, "r2": 33, "c2": 0, "v": "配偶者に関する事項" }, - { "r1": 11, "c1": 3, "r2": 12, "c2": 6 }, - { - "r1": 1, - "c1": 0, - "r2": 1, - "c2": 10, - "v": "介護保険負担限度額認定申請書" - }, - { "r1": 6, "c1": 0, "r2": 6, "c2": 1, "v": "フ リ ガ ナ" }, - { "r1": 35, "c1": 2, "r2": 36, "c2": 2, "v": "□" }, - { "r1": 6, "c1": 3, "r2": 6, "c2": 4 }, - { "r1": 23, "c1": 1, "r2": 24, "c2": 1, "v": "氏  名" }, - { "r1": 6, "c1": 8, "r2": 7, "c2": 10 }, - { "r1": 20, "c1": 0, "r2": 21, "c2": 1, "v": "配偶者の有無" }, - { "r1": 26, "c1": 1, "r2": 28, "c2": 1, "v": "住  所" }, - { "r1": 25, "c1": 8, "r2": 25, "c2": 9, "v": "個人番号" }, - { "r1": 47, "c1": 2, "r2": 50, "c2": 2, "v": "□" }, - { "r1": 8, "c1": 8, "r2": 8, "c2": 10 }, - { - "r1": 29, - "c1": 1, - "r2": 32, - "c2": 1, - "v": "本年1月1日現在の住所   (現住所と     異なる場合)" - }, - { "r1": 18, "c1": 0, "r2": 18, "c2": 1, "v": "年月日(*)" }, - { - "r1": 13, - "c1": 0, - "r2": 16, - "c2": 1, - "v": "入所(院)し       た介護保険施        設の所在地及     び名称(※)" - }, - { "r1": 3, "c1": 0, "r2": 3, "c2": 1, "v": "(申請先)" }, - { - "r1": 9, - "c1": 2, - "r2": 9, - "c2": 6, - "v": "明・大・昭      年     月     日" - }, - { - "r1": 20, - "c1": 2, - "r2": 21, - "c2": 4, - "v": "有     ・     無" - }, - { - "r1": 68, - "c1": 1, - "r2": 69, - "c2": 10, - "v": " 虚偽の申告により不正に特定入所者介護サービス費等の支給を受けた場合には、介護保険法第22条第1項              の規定に基づき、支給された額及び最大2倍の加算金を返還していただくことがあります。" - }, - { - "r1": 20, - "c1": 5, - "r2": 21, - "c2": 10, - "v": "左記において「無」の場合は、以下の「配偶者に関する事項」につい                   ては、記入不要です。" - }, - { "r1": 9, "c1": 8, "r2": 9, "c2": 10 }, - { "r1": 51, "c1": 4, "r2": 53, "c2": 4, "v": "円" }, - { - "r1": 65, - "c1": 1, - "r2": 66, - "c2": 10, - "v": " 預貯金等に関しては、同じ種類の預貯金等を複数所有している場合は、そのすべてを記入し、通帳等の写し                  を添付してください。" - }, - { - "r1": 17, - "c1": 5, - "r2": 18, - "c2": 10, - "v": "(※)介護保険施設に入所(院)していない場合及び                        ショートスティを利用している場合は、記入不要です。" - }, - { "r1": 17, "c1": 0, "r2": 17, "c2": 1, "v": "入所(院)" }, - { "r1": 41, "c1": 2, "r2": 43, "c2": 2, "v": "□" }, - { - "r1": 35, - "c1": 3, - "r2": 36, - "c2": 10, - "v": "①生活保護受給者/②市町村民税世帯非課税である老齢福祉年金受給者" - }, - { "r1": 44, "c1": 2, "r2": 46, "c2": 2, "v": "□" }, - { "r1": 51, "c1": 2, "r2": 53, "c2": 3, "v": "預貯金額" }, - { "r1": 51, "c1": 8, "r2": 51, "c2": 9, "v": "その他" }, - { "r1": 7, "c1": 0, "r2": 8, "c2": 1, "v": "被保険者氏名" }, - { "r1": 51, "c1": 7, "r2": 53, "c2": 7, "v": "円" }, - { - "r1": 35, - "c1": 0, - "r2": 46, - "c2": 1, - "v": "収入等に関    する申告" - }, - { - "r1": 17, - "c1": 2, - "r2": 18, - "c2": 4, - "v": "     年   月   日" - }, - { "r1": 37, "c1": 2, "r2": 40, "c2": 2, "v": "□" }, - { "r1": 9, "c1": 0, "r2": 9, "c2": 1, "v": "生年月日" }, - { "r1": 53, "c1": 5, "r2": 53, "c2": 6, "v": "(評価概算額)" } - ] + "merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [ + 55, + 5, + 55, + 10, + "申請者が被保険者本人の場合には、下記について記載は不要です。" + ], + [47, 0, 53, 1, "預貯金等に     関する申告"], + [52, 8, 53, 9, "(現金・負債      を含む)"], + [51, 5, 52, 6, "有価証券"], + [54, 8, 54, 10, " "], + [10, 0, 12, 1, "住   所"], + [10, 2, 10, 10, " "], + [7, 3, 8, 4, " "], + [3, 7, 3, 10, "     年    月    日"], + [6, 7, 7, 7, " 被保険者番号"], + [7, 6, 8, 6, " "], + [22, 0, 33, 0, "配偶者に関する事項"], + [11, 3, 12, 6, " "], + [1, 0, 1, 10, "介護保険負担限度額認定申請書"], + [6, 0, 6, 1, "フ リ ガ ナ"], + [35, 2, 36, 2, "□"], + [23, 1, 24, 1, "氏  名"], + [6, 8, 7, 10, " "], + [6, 3, 6, 4, " "], + [20, 0, 21, 1, "配偶者の有無"], + [26, 1, 28, 1, "住  所"], + [25, 8, 25, 9, "個人番号"], + [47, 2, 50, 2, "□"], + [8, 8, 8, 10, " "], + [ + 29, + 1, + 32, + 1, + "本年1月1日現在の住所   (現住所と     異なる場合)" + ], + [18, 0, 18, 1, "年月日(*)"], + [ + 13, + 0, + 16, + 1, + "入所(院)し       た介護保険施        設の所在地及     び名称(※)" + ], + [9, 2, 9, 6, "明・大・昭      年     月     日"], + [3, 0, 3, 1, "(申請先)"], + [20, 2, 21, 4, "有     ・     無"], + [ + 68, + 1, + 69, + 10, + " 虚偽の申告により不正に特定入所者介護サービス費等の支給を受けた場合には、介護保険法第22条第1項              の規定に基づき、支給された額及び最大2倍の加算金を返還していただくことがあります。" + ], + [ + 20, + 5, + 21, + 10, + "左記において「無」の場合は、以下の「配偶者に関する事項」につい                   ては、記入不要です。" + ], + [9, 8, 9, 10, " "], + [51, 4, 53, 4, "円"], + [ + 65, + 1, + 66, + 10, + " 預貯金等に関しては、同じ種類の預貯金等を複数所有している場合は、そのすべてを記入し、通帳等の写し                  を添付してください。" + ], + [ + 17, + 5, + 18, + 10, + "(※)介護保険施設に入所(院)していない場合及び                        ショートスティを利用している場合は、記入不要です。" + ], + [17, 0, 17, 1, "入所(院)"], + [41, 2, 43, 2, "□"], + [ + 35, + 3, + 36, + 10, + "①生活保護受給者/②市町村民税世帯非課税である老齢福祉年金受給者" + ], + [44, 2, 46, 2, "□"], + [51, 2, 53, 3, "預貯金額"], + [51, 8, 51, 9, "その他"], + [7, 0, 8, 1, "被保険者氏名"], + [51, 7, 53, 7, "円"], + [35, 0, 46, 1, "収入等に関    する申告"], + [17, 2, 18, 4, "     年   月   日"], + [37, 2, 40, 2, "□"], + [9, 0, 9, 1, "生年月日"], + [53, 5, 53, 6, "(評価概算額)"] + ] + } } } } From c21d544c34dc495e4527180628b7cf9112a0420c Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 22:15:36 +0900 Subject: [PATCH 10/16] =?UTF-8?q?=E3=83=89=E3=82=AD=E3=83=A5=E3=83=A1?= =?UTF-8?q?=E3=83=B3=E3=83=88=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.ja.md | 27 +++++++++++------------- README.md | 41 +++++++++++++++++++----------------- docs/README.en.md | 23 +++++++++----------- docs/README.ja.md | 21 ++++++++---------- docs/release-notes/v0.3.2.md | 2 +- 5 files changed, 54 insertions(+), 60 deletions(-) diff --git a/README.ja.md b/README.ja.md index 4cbdafc..351943a 100644 --- a/README.ja.md +++ b/README.ja.md @@ -2,7 +2,7 @@ [![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) -![ExStruct Image](/docs/assets/icon.webp) +![ExStruct Image](/assets/icon.webp) ExStruct は Excel ワークブックを読み取り、構造化データ(セル・テーブル候補・図形・チャート・SmartArt・印刷範囲ビュー)をデフォルトで JSON に出力します。必要に応じて YAML/TOON も選択でき、COM/Excel 環境ではリッチ抽出、非 COM 環境ではセル+テーブル候補+印刷範囲へのフォールバックで安全に動作します。LLM/RAG 向けに検出ヒューリスティックや出力モードを調整可能です。 @@ -160,7 +160,7 @@ exstruct input.xlsx --pdf --image --dpi 144 - 図形のみで作成したフローチャート (下画像が実際のサンプル Excel シート) -![Sample Excel](/docs/assets/demo_sheet.png) +![Sample Excel](/assets/demo_sheet.png) サンプル Excel: `sample/sample.xlsx` ### 1. Input: Excel Sheet Overview @@ -339,7 +339,7 @@ flowchart TD ### Excel データ -![一般的な申請書Excel](/docs/assets/demo_form.ja.png) +![一般的な申請書Excel](/assets/demo_form.ja.png) ### ExStruct JSON @@ -360,18 +360,15 @@ flowchart TD ... ], "table_candidates": ["B25:C26", "C37:D50"], - "merged_cells": [ - { - "r1": 55, - "c1": 5, - "r2": 55, - "c2": 10, - "v": "申請者が被保険者本人の場合には、下記について記載は不要です。" - }, - { "r1": 54, "c1": 8, "r2": 54, "c2": 10 }, - { "r1": 51, "c1": 5, "r2": 52, "c2": 6, "v": "有価証券" }, - ... - ] + "merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [55, 5, 55, 10, "申請者が被保険者本人の場合には、下記について記載は不要です。"], + [54, 8, 54, 10, " "], + [51, 5, 52, 6, "有価証券"], + ... + ] + } } } } diff --git a/README.md b/README.md index 224b280..ceaaafb 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![PyPI version](https://badge.fury.io/py/exstruct.svg)](https://pypi.org/project/exstruct/) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/exstruct?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/exstruct) ![Licence: BSD-3-Clause](https://img.shields.io/badge/license-BSD--3--Clause-blue?style=flat-square) [![pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml/badge.svg)](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/e081cb4f634e4175b259eb7c34f54f60)](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [![codecov](https://codecov.io/gh/harumiWeb/exstruct/graph/badge.svg?token=2XI1O8TTA9)](https://codecov.io/gh/harumiWeb/exstruct) -![ExStruct Image](/docs/assets/icon.webp) +![ExStruct Image](docs/assets/icon.webp) ExStruct reads Excel workbooks and outputs structured data (cells, table candidates, shapes, charts, smartart, merged cell ranges, print areas/views, auto page-break areas, hyperlinks) as JSON by default, with optional YAML/TOON formats. It targets both COM/Excel environments (rich extraction) and non-COM environments (cells + table candidates + print areas), with tunable detection heuristics and multiple output modes to fit LLM/RAG pipelines. @@ -43,8 +43,8 @@ exstruct input.xlsx -o out.json --pretty # pretty JSON to a file exstruct input.xlsx --format yaml # YAML (needs pyyaml) exstruct input.xlsx --format toon # TOON (needs python-toon) exstruct input.xlsx --sheets-dir sheets/ # split per sheet in chosen format -exstruct input.xlsx --print-areas-dir areas/ # split per print area (if any) exstruct input.xlsx --auto-page-breaks-dir auto_areas/ # COM only; option appears when available +exstruct input.xlsx --print-areas-dir areas/ # split per print area (if any) exstruct input.xlsx --mode light # cells + table candidates only exstruct input.xlsx --pdf --image # PDF and PNGs (Excel required) ``` @@ -92,9 +92,9 @@ engine = ExStructEngine( ), ) wb2 = engine.extract("input.xlsx") -engine.export(wb2, Path("out_filtered.json")) # drops shapes via filters +engine.export(wb2, Path("out_filtered.json")) -# Enable hyperlinks in other modes +# Enable hyperlinks in standard mode engine_links = ExStructEngine(options=StructOptions(mode="standard", include_cell_links=True)) with_links = engine_links.extract("input.xlsx") @@ -161,7 +161,7 @@ To show how well exstruct can structure Excel, we parse a workbook that combines - Flowchart built only with shapes (Screenshot below is the actual sample Excel sheet) -![Sample Excel](/docs/assets/demo_sheet.png) +![Sample Excel](docs/assets/demo_sheet.png) Sample workbook: `sample/sample.xlsx` ### 1. Input: Excel Sheet Overview @@ -336,11 +336,12 @@ flowchart TD ``` ```` + ## Example 2: General Application Form ### Excel Sheet -![General Application Form Excel](/docs/assets/demo_form_en.png) +![General Application Form Excel](docs/assets/demo_form_en.png) ### ExStruct JSON @@ -376,19 +377,16 @@ flowchart TD } ], "print_areas": [{ "r1": 1, "c1": 0, "r2": 66, "c2": 23 }], - "merged_cells": [ - { "r1": 34, "c1": 15, "r2": 34, "c2": 23 }, - { - "r1": 56, - "c1": 10, - "r2": 57, - "c2": 17, - "v": "Federal Share Calculation" - }, - { "r1": 18, "c1": 10, "r2": 18, "c2": 23 }, - { "r1": 15, "c1": 0, "r2": 15, "c2": 1 }, - ... - ] + "merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [34, 15, 34, 23, " "], + [56, 10, 57, 17, "Federal Share Calculation"], + [18, 10, 18, 23, " "], + [15, 0, 15, 1, " "], + ... + ] + } } } } @@ -596,6 +594,11 @@ This project is suitable for teams that: - Use CLI `--auto-page-breaks-dir` (COM only), `DestinationOptions.auto_page_breaks_dir` (preferred), or `export_auto_page_breaks(...)` to write per-auto-page-break files; the API raises `ValueError` if no auto page breaks exist. - `PrintAreaView` includes rows and table candidates inside the area, plus shapes/charts that overlap the area (size-less shapes are treated as points). `normalize=True` rebases row/col indices to the area origin. +## Documentation build + +- Update generated model docs before building the site: `python scripts/gen_model_docs.py`. +- Build locally with mkdocs + mkdocstrings (dev deps required): `uv run mkdocs serve` or `uv run mkdocs build`. + ## Architecture ExStruct uses a pipeline-based architecture that separates diff --git a/docs/README.en.md b/docs/README.en.md index 24c9702..ce88b02 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -377,19 +377,16 @@ flowchart TD } ], "print_areas": [{ "r1": 1, "c1": 0, "r2": 66, "c2": 23 }], - "merged_cells": [ - { "r1": 34, "c1": 15, "r2": 34, "c2": 23 }, - { - "r1": 56, - "c1": 10, - "r2": 57, - "c2": 17, - "v": "Federal Share Calculation" - }, - { "r1": 18, "c1": 10, "r2": 18, "c2": 23 }, - { "r1": 15, "c1": 0, "r2": 15, "c2": 1 }, - ... - ] + "merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [34, 15, 34, 23, " "], + [56, 10, 57, 17, "Federal Share Calculation"], + [18, 10, 18, 23, " "], + [15, 0, 15, 1, " "], + ... + ] + } } } } diff --git a/docs/README.ja.md b/docs/README.ja.md index ba289fe..351943a 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -360,18 +360,15 @@ flowchart TD ... ], "table_candidates": ["B25:C26", "C37:D50"], - "merged_cells": [ - { - "r1": 55, - "c1": 5, - "r2": 55, - "c2": 10, - "v": "申請者が被保険者本人の場合には、下記について記載は不要です。" - }, - { "r1": 54, "c1": 8, "r2": 54, "c2": 10 }, - { "r1": 51, "c1": 5, "r2": 52, "c2": 6, "v": "有価証券" }, - ... - ] + "merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [55, 5, 55, 10, "申請者が被保険者本人の場合には、下記について記載は不要です。"], + [54, 8, 54, 10, " "], + [51, 5, 52, 6, "有価証券"], + ... + ] + } } } } diff --git a/docs/release-notes/v0.3.2.md b/docs/release-notes/v0.3.2.md index d353016..3668bb4 100644 --- a/docs/release-notes/v0.3.2.md +++ b/docs/release-notes/v0.3.2.md @@ -5,7 +5,7 @@ pipeline integration and tests to cover the new feature. ## Highlights -- `MergedCell` model and `SheetData.merged_cells` added to output. +- `MergedCells` format and `SheetData.merged_cells` added to output. - Merged cell ranges extracted via openpyxl in standard/verbose modes. - Output can exclude merged cells via `OutputOptions.filters.include_merged_cells`. - Pipeline/Backend/Modeling integrations updated with coverage for the new flow. From 4a3fcb06df4951a5197d43cc9632f28b5490a424 Mon Sep 17 00:00:00 2001 From: harumi <164025931+harumiWeb@users.noreply.github.com> Date: Tue, 6 Jan 2026 22:28:06 +0900 Subject: [PATCH 11/16] Update tests/models/test_models_export.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- tests/models/test_models_export.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/models/test_models_export.py b/tests/models/test_models_export.py index 22a66a4..b318d7f 100644 --- a/tests/models/test_models_export.py +++ b/tests/models/test_models_export.py @@ -139,6 +139,13 @@ def test_sheet_json_includes_smartart_nodes() -> None: def test_sheet_json_includes_merged_cells_schema() -> None: + """ + Verify that SheetData.to_json serializes merged_cells with schema and items. + + Asserts that the JSON output includes a merged_cells object with a schema + field containing ["r1", "c1", "r2", "c2", "v"] and an items array with the + provided merged cell data as a 5-element array. + """ sheet = SheetData( rows=[], merged_cells=MergedCells(items=[(1, 0, 1, 1, "merged")]), From db3808e3c33cb7bd904a0d132faddcd6be371696 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 22:29:56 +0900 Subject: [PATCH 12/16] =?UTF-8?q?=E3=83=86=E3=82=B9=E3=83=88=E3=82=B1?= =?UTF-8?q?=E3=83=BC=E3=82=B9=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/core/test_pipeline.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 22c91b3..1c95daf 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -6,6 +6,7 @@ from exstruct.core.pipeline import ( ExtractionArtifacts, ExtractionInputs, + _filter_rows_excluding_merged_values, build_cells_tables_workbook, build_com_pipeline, build_pre_com_pipeline, @@ -219,3 +220,24 @@ def test_build_cells_tables_workbook_excludes_merged_values_in_rows( wb = build_cells_tables_workbook(inputs=inputs, artifacts=artifacts, reason="test") sheet = wb.sheets["Sheet1"] assert sheet.rows[0].c == {"2": "C"} + + +def test_filter_rows_excluding_merged_values_updates_links() -> None: + rows = [ + CellRow( + r=1, + c={"0": "A", "1": "B", "x": "keep"}, + links={"0": "L0", "1": "L1", "x": "LX"}, + ) + ] + merged_cells = [MergedCellRange(r1=1, c1=0, r2=1, c2=1, v="A")] + filtered = _filter_rows_excluding_merged_values(rows, merged_cells) + assert filtered[0].c == {"x": "keep"} + assert filtered[0].links == {"x": "LX"} + + +def test_filter_rows_excluding_merged_values_drops_empty_rows() -> None: + rows = [CellRow(r=1, c={"0": "A"}, links={"0": "L0"})] + merged_cells = [MergedCellRange(r1=1, c1=0, r2=1, c2=0, v="A")] + filtered = _filter_rows_excluding_merged_values(rows, merged_cells) + assert filtered == [] From 0035343a323e1ce2064ba71690d268b8e3c6f824 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 22:40:40 +0900 Subject: [PATCH 13/16] =?UTF-8?q?=E6=B3=A8=E8=A8=98=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.ja.md | 38 ++++++++++++++++++++++++++++++++++++++ README.md | 38 ++++++++++++++++++++++++++++++++++++++ docs/README.en.md | 38 ++++++++++++++++++++++++++++++++++++++ docs/README.ja.md | 38 ++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 5 files changed, 153 insertions(+), 1 deletion(-) diff --git a/README.ja.md b/README.ja.md index 351943a..87f7c24 100644 --- a/README.ja.md +++ b/README.ja.md @@ -375,6 +375,44 @@ flowchart TD ``` +### 互換性メモ(v0.3.5): merged_cells 形式変更 + +`merged_cells` は v0.3.5 で「オブジェクト配列」から「schema/items」形式に変更されました(JSON 利用側には破壊的変更)。 + +旧形式(<= v0.3.2): + +```json +"merged_cells": [ + { "r1": 55, "c1": 5, "r2": 55, "c2": 10, "v": "申請者が被保険者本人の場合には、下記について記載は不要です。" }, + { "r1": 51, "c1": 5, "r2": 52, "c2": 6, "v": "有価証券" } +] +``` + +新形式(v0.3.5+): + +```json +"merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [55, 5, 55, 10, "申請者が被保険者本人の場合には、下記について記載は不要です。"], + [51, 5, 52, 6, "有価証券"] + ] +} +``` + +移行例(併存パース): + +```python +def normalize_merged_cells(raw): + schema = ["r1", "c1", "r2", "c2", "v"] + if isinstance(raw, list): + items = [[d.get(k, " ") for k in schema] for d in raw] + return {"schema": schema, "items": items} + if isinstance(raw, dict) and "schema" in raw and "items" in raw: + return raw + return None +``` + ### LLM 推論による ExStruct JSON → Markdown 変換結果 ```md diff --git a/README.md b/README.md index ceaaafb..7588c0e 100644 --- a/README.md +++ b/README.md @@ -393,6 +393,44 @@ flowchart TD ``` +### Migration note (v0.3.5): merged_cells format change + +`merged_cells` changed from a list of objects to a schema/items structure in v0.3.5 (breaking change for JSON consumers). + +Old format (<= v0.3.2): + +```json +"merged_cells": [ + { "r1": 34, "c1": 15, "r2": 34, "c2": 23, "v": " " }, + { "r1": 56, "c1": 10, "r2": 57, "c2": 17, "v": "Federal Share Calculation" } +] +``` + +New format (v0.3.5+): + +```json +"merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [34, 15, 34, 23, " "], + [56, 10, 57, 17, "Federal Share Calculation"] + ] +} +``` + +Migration example (support both during transition): + +```python +def normalize_merged_cells(raw): + schema = ["r1", "c1", "r2", "c2", "v"] + if isinstance(raw, list): + items = [[d.get(k, " ") for k in schema] for d in raw] + return {"schema": schema, "items": items} + if isinstance(raw, dict) and "schema" in raw and "items" in raw: + return raw + return None +``` + ### LLM reconstruction example ```md diff --git a/docs/README.en.md b/docs/README.en.md index ce88b02..38fa5b0 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -393,6 +393,44 @@ flowchart TD ``` +### Migration note (v0.3.5): merged_cells format change + +`merged_cells` changed from a list of objects to a schema/items structure in v0.3.5 (breaking change for JSON consumers). + +Old format (<= v0.3.2): + +```json +"merged_cells": [ + { "r1": 34, "c1": 15, "r2": 34, "c2": 23, "v": " " }, + { "r1": 56, "c1": 10, "r2": 57, "c2": 17, "v": "Federal Share Calculation" } +] +``` + +New format (v0.3.5+): + +```json +"merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [34, 15, 34, 23, " "], + [56, 10, 57, 17, "Federal Share Calculation"] + ] +} +``` + +Migration example (support both during transition): + +```python +def normalize_merged_cells(raw): + schema = ["r1", "c1", "r2", "c2", "v"] + if isinstance(raw, list): + items = [[d.get(k, " ") for k in schema] for d in raw] + return {"schema": schema, "items": items} + if isinstance(raw, dict) and "schema" in raw and "items" in raw: + return raw + return None +``` + ### LLM reconstruction example ```md diff --git a/docs/README.ja.md b/docs/README.ja.md index 351943a..146aa6f 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -375,6 +375,44 @@ flowchart TD ``` +### 互換性メモ(v0.3.5): merged_cells 形式変更 + +`merged_cells` は v0.15 で「オブジェクト配列」から「schema/items」形式に変更されました(JSON 利用側には破壊的変更)。 + +旧形式(<= v0.3.2): + +```json +"merged_cells": [ + { "r1": 55, "c1": 5, "r2": 55, "c2": 10, "v": "申請者が被保険者本人の場合には、下記について記載は不要です。" }, + { "r1": 51, "c1": 5, "r2": 52, "c2": 6, "v": "有価証券" } +] +``` + +新形式(v0.3.5+): + +```json +"merged_cells": { + "schema": ["r1", "c1", "r2", "c2", "v"], + "items": [ + [55, 5, 55, 10, "申請者が被保険者本人の場合には、下記について記載は不要です。"], + [51, 5, 52, 6, "有価証券"] + ] +} +``` + +移行例(併存パース): + +```python +def normalize_merged_cells(raw): + schema = ["r1", "c1", "r2", "c2", "v"] + if isinstance(raw, list): + items = [[d.get(k, " ") for k in schema] for d in raw] + return {"schema": schema, "items": items} + if isinstance(raw, dict) and "schema" in raw and "items" in raw: + return raw + return None +``` + ### LLM 推論による ExStruct JSON → Markdown 変換結果 ```md diff --git a/pyproject.toml b/pyproject.toml index 68b391f..731ae9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "exstruct" -version = "0.3.2" +version = "0.3.5" description = "Excel to structured JSON (tables, shapes, charts) for LLM/RAG pipelines" readme = "README.md" license = { file = "LICENSE" } From bbc9abc8feb50691652c05b0c978f997db15a895 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 22:42:23 +0900 Subject: [PATCH 14/16] =?UTF-8?q?=E3=83=AA=E3=83=AA=E3=83=BC=E3=82=B9?= =?UTF-8?q?=E3=83=8E=E3=83=BC=E3=83=88=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/release-notes/v0.3.5.md | 18 ++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 19 insertions(+) create mode 100644 docs/release-notes/v0.3.5.md diff --git a/docs/release-notes/v0.3.5.md b/docs/release-notes/v0.3.5.md new file mode 100644 index 0000000..02f9faa --- /dev/null +++ b/docs/release-notes/v0.3.5.md @@ -0,0 +1,18 @@ +# v0.3.5 Release Notes + +This release introduces a new merged_cells serialization format and adds a +compatibility flag to control merged cell values in rows. + +## Highlights + +- `merged_cells` output now uses a schema/items structure (breaking change). +- Added `StructOptions.include_merged_values_in_rows` to control whether merged + cell values remain in `rows` (default: True). +- Merged cell values with no content are normalized to a single space `" "`. +- Updated tests and documentation to reflect the new format. + +## Compatibility Notes + +- Old format (<= v0.3.2): `merged_cells` is a list of objects. +- New format (v0.3.5+): `merged_cells` is an object with `schema` and `items`. +- Use the migration note in the README for transition guidance. diff --git a/mkdocs.yml b/mkdocs.yml index d143ed9..126d2dc 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 @@ nav: - CLI Guide: cli.md - Concept / Why ExStruct?: concept.md - Release Notes: + - v0.3.5: release-notes/v0.3.5.md - v0.3.2: release-notes/v0.3.2.md - v0.3.1: release-notes/v0.3.1.md - v0.3.0: release-notes/v0.3.0.md From e91a37926cdea1a8ba42c52d456592caf284cecc Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 22:45:05 +0900 Subject: [PATCH 15/16] =?UTF-8?q?-=20resolve=5Fextraction=5Finputs=20?= =?UTF-8?q?=E3=81=A7=20include=5Fmerged=5Fvalues=5Fin=5Frows=3DFalse=20?= =?UTF-8?q?=E3=81=AE=E5=A0=B4=E5=90=88=E3=81=AB=20resolved=5Fmerged=5Fcell?= =?UTF-8?q?s=3DTrue=20=E3=82=92=E5=BC=B7=E5=88=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/exstruct/core/pipeline.py | 2 ++ tests/core/test_pipeline.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/exstruct/core/pipeline.py b/src/exstruct/core/pipeline.py index 3a4657a..363596a 100644 --- a/src/exstruct/core/pipeline.py +++ b/src/exstruct/core/pipeline.py @@ -225,6 +225,8 @@ def resolve_extraction_inputs( resolved_merged_cells = ( include_merged_cells if include_merged_cells is not None else mode != "light" ) + if not include_merged_values_in_rows: + resolved_merged_cells = True return ExtractionInputs( file_path=normalized_file_path, diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 1c95daf..80f45c7 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -159,6 +159,24 @@ def test_resolve_extraction_inputs_defaults(tmp_path: Path) -> None: assert inputs.include_merged_cells is True +def test_resolve_extraction_inputs_forces_merged_cells_when_excluding_values( + tmp_path: Path, +) -> None: + inputs = resolve_extraction_inputs( + tmp_path / "book.xlsx", + mode="light", + include_cell_links=None, + include_print_areas=None, + include_auto_page_breaks=False, + include_colors_map=None, + include_default_background=False, + ignore_colors=None, + include_merged_cells=False, + include_merged_values_in_rows=False, + ) + assert inputs.include_merged_cells is True + + def test_build_cells_tables_workbook_uses_print_areas( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: From ba1ee5a528f696f2fc7fb2fab4a49fb5fa5f1446 Mon Sep 17 00:00:00 2001 From: harumiWeb Date: Tue, 6 Jan 2026 22:50:44 +0900 Subject: [PATCH 16/16] =?UTF-8?q?=E3=83=90=E3=83=BC=E3=82=B8=E3=83=A7?= =?UTF-8?q?=E3=83=B3=E8=AA=A4=E8=A8=98=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/README.ja.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/README.ja.md b/docs/README.ja.md index 146aa6f..87f7c24 100644 --- a/docs/README.ja.md +++ b/docs/README.ja.md @@ -377,7 +377,7 @@ flowchart TD ### 互換性メモ(v0.3.5): merged_cells 形式変更 -`merged_cells` は v0.15 で「オブジェクト配列」から「schema/items」形式に変更されました(JSON 利用側には破壊的変更)。 +`merged_cells` は v0.3.5 で「オブジェクト配列」から「schema/items」形式に変更されました(JSON 利用側には破壊的変更)。 旧形式(<= v0.3.2):