Skip to content

Commit

Permalink
Added three right-to-left test files
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
  • Loading branch information
PeterStaar-IBM committed Feb 4, 2025
1 parent 5db82d5 commit d7c9874
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 146 deletions.
245 changes: 106 additions & 139 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ pydantic = "^2.0.0"
docling-core = {extras = ["chunking"], version = "^2.17.0"}
docling-ibm-models = "^3.3.0"
deepsearch-glm = "^1.0.0"
docling-parse = "^3.1.0"
# docling-parse = "^3.1.0"  # released version, replaced by the git revision pinned below
docling-parse = { git = "https://github.com/DS4SD/docling-parse.git", rev = "93e281576e740345d0161ad5da1b1fff815df8e4"}
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
Expand Down
Binary file added tests/data/pdf/right_to_left_01.pdf
Binary file not shown.
Binary file added tests/data/pdf/right_to_left_02.pdf
Binary file not shown.
Binary file added tests/data/pdf/right_to_left_03.pdf
Binary file not shown.
15 changes: 9 additions & 6 deletions tests/verify_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,13 +248,14 @@ def verify_conversion_result_v1(
doc_pred_md = doc_result.legacy_document.export_to_markdown()
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()


engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"

gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
if str(input_path.parent).endswith("pdf"):
gt_subpath = input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name

gt_subpath = (
input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name
)

pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
Expand Down Expand Up @@ -333,8 +334,10 @@ def verify_conversion_result_v2(

gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name
if str(input_path.parent).endswith("pdf"):
gt_subpath = input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name

gt_subpath = (
input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name
)

pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
Expand Down

0 comments on commit d7c9874

Please sign in to comment.