From e86d0c419f510ac8affc23fd9b05bfeede3e012d Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 6 Jan 2026 20:05:00 -0600 Subject: [PATCH 1/6] feat: use text coverage for an inferred region to set is_extracted --- .../pdf_image/test_pdfminer_processing.py | 25 ++++++-- unstructured/partition/pdf_image/ocr.py | 2 +- .../pdf_image/pdfminer_processing.py | 63 +++++++++++++++++-- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index e0fb02f23f..8934adb223 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -162,16 +162,33 @@ def test_aggregate_by_block(): expected = "Inside region1 Inside region2" embedded_regions = TextRegions.from_list( [ - TextRegion.from_coords(0, 0, 20, 20, "Inside region1"), - TextRegion.from_coords(20, 20, 80, 80, None), - TextRegion.from_coords(50, 50, 150, 150, "Inside region2"), + TextRegion.from_coords(0, 0, 300, 20, "Inside region1"), + TextRegion.from_coords(0, 20, 300, 80, None), + TextRegion.from_coords(0, 80, 200, 300, "Inside region2"), TextRegion.from_coords(250, 250, 350, 350, "Outside region"), ] ) + embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4) + target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)]) + + text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions) + assert text == expected + assert extracted.value == "true" + + +def test_aggregate_only_partially_fill_target(): + expected = "Inside region1" + embedded_regions = TextRegions.from_list( + [ + TextRegion.from_coords(0, 0, 20, 20, "Inside region1"), + ] + ) + embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE]) target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)]) - text, _ = 
aggregate_embedded_text_by_block(target_region, embedded_regions) + text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions) assert text == expected + assert extracted.value == "false" @pytest.mark.parametrize( diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 852e2f94e4..f0660ac9dd 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -396,7 +396,7 @@ def merge_out_layout_with_ocr_layout( out_layout.texts[idx], _ = aggregate_embedded_text_by_block( target_region=out_layout.slice([idx]), source_regions=ocr_layout, - threshold=subregion_threshold, + subregion_threshold=subregion_threshold, ) final_layout = ( diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 0c634c32ea..6ea5adb005 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -774,10 +774,49 @@ def remove_duplicate_elements( return elements.slice(np.concatenate(ious)) +def _inter_union(box1, box2): + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + + # Calculate intersection coordinates + x0_inter = max(x0_1, x0_2) + y0_inter = max(y0_1, y0_2) + x1_inter = min(x1_1, x1_2) + y1_inter = min(y1_1, y1_2) + + # Calculate intersection area + inter_width = max(0, x1_inter - x0_inter) + inter_height = max(0, y1_inter - y0_inter) + intersection = inter_width * inter_height + + # Calculate area of both boxes + area1 = (x1_1 - x0_1) * (y1_1 - y0_1) + area2 = (x1_2 - x0_2) * (y1_2 - y0_2) + + # Calculate union area + union = area1 + area2 - intersection + return intersection, union + + +def _aggregated_iou(box1s, box2): + intersection = 0.0 + union = 0.0 + + for i in range(box1s.shape[0]): + _intersection, _union = _inter_union(box1s[i, :], box2) + intersection += _intersection + union += _union + + if union == 0: + return 
1.0 + return intersection / union + + def aggregate_embedded_text_by_block( target_region: TextRegions, source_regions: TextRegions, - threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, + subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, + embed_region_threshold: float = 0.25, ) -> tuple[str, IsExtracted | None]: """Extracts the text aggregated from the elements of the given layout that lie within the given block.""" @@ -789,17 +828,29 @@ def aggregate_embedded_text_by_block( bboxes1_is_almost_subregion_of_bboxes2( source_regions.element_coords, target_region.element_coords, - threshold, + subregion_threshold, ) .sum(axis=1) .astype(bool) ) text = " ".join([text for text in source_regions.slice(mask).texts if text]) - # if nothing is sliced then it is not extracted - is_extracted = sum(mask) and all( - flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array - ) + + if sum(mask): + source_bboxes = source_regions.slice(mask).element_coords + + target_bboxes = target_region.element_coords + + iou = _aggregated_iou(source_bboxes, target_bboxes[0, :]) + print(text, iou) + + is_extracted = ( + all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array) + and iou > embed_region_threshold + ) + else: + # if nothing is sliced then it is not extracted + is_extracted = False return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE From db2dc9cb87fb4331f55dbe6fb30b4f8c674aa629 Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 7 Jan 2026 09:53:29 -0600 Subject: [PATCH 2/6] fix aggregate iou computation --- CHANGELOG.md | 3 +- unstructured/__version__.py | 2 +- .../pdf_image/pdfminer_processing.py | 34 +++---------------- 3 files changed, 8 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67717f5688..b7fd029f0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.18.27-dev3 +## 0.18.27 ### Fixes - Comment 
no-ops in `zoom_image` (codeflash) +- Fix an issue where elements with partially filled extracted text are marked as extracted ### Enhancement - Optimize `sentence_count` (codeflash) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5ee98dbaca..4bb2b92ac3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.27-dev3" # pragma: no cover +__version__ = "0.18.27" # pragma: no cover diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 6ea5adb005..6a21da7872 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -774,38 +774,15 @@ def remove_duplicate_elements( return elements.slice(np.concatenate(ious)) -def _inter_union(box1, box2): - x0_1, y0_1, x1_1, y1_1 = box1 - x0_2, y0_2, x1_2, y1_2 = box2 - - # Calculate intersection coordinates - x0_inter = max(x0_1, x0_2) - y0_inter = max(y0_1, y0_2) - x1_inter = min(x1_1, x1_2) - y1_inter = min(y1_1, y1_2) - - # Calculate intersection area - inter_width = max(0, x1_inter - x0_inter) - inter_height = max(0, y1_inter - y0_inter) - intersection = inter_width * inter_height - - # Calculate area of both boxes - area1 = (x1_1 - x0_1) * (y1_1 - y0_1) - area2 = (x1_2 - x0_2) * (y1_2 - y0_2) - - # Calculate union area - union = area1 + area2 - intersection - return intersection, union - - def _aggregated_iou(box1s, box2): intersection = 0.0 - union = 0.0 + sum_areas = calculate_bbox_area(box2) for i in range(box1s.shape[0]): - _intersection, _union = _inter_union(box1s[i, :], box2) - intersection += _intersection - union += _union + intersection += calculate_intersection_area(box1s[i, :], box2) + sum_areas += calculate_bbox_area(box1s[i, :]) + + union = sum_areas - intersection if union == 0: return 1.0 @@ -838,7 +815,6 @@ def aggregate_embedded_text_by_block( if sum(mask): source_bboxes = 
source_regions.slice(mask).element_coords - target_bboxes = target_region.element_coords iou = _aggregated_iou(source_bboxes, target_bboxes[0, :]) From e3d389471478029c7e1c61fab28c7da102cbbc1a Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 7 Jan 2026 09:54:29 -0600 Subject: [PATCH 3/6] remove debug print --- unstructured/partition/pdf_image/pdfminer_processing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 6a21da7872..9e248e7891 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -818,7 +818,6 @@ def aggregate_embedded_text_by_block( target_bboxes = target_region.element_coords iou = _aggregated_iou(source_bboxes, target_bboxes[0, :]) - print(text, iou) is_extracted = ( all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array) From 819956c7138f1ab361aafcac0b9aef9b07c8fe31 Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 7 Jan 2026 12:36:21 -0600 Subject: [PATCH 4/6] use config to set threshold --- unstructured/partition/pdf_image/pdfminer_processing.py | 4 ++-- unstructured/partition/utils/config.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 9e248e7891..1f4c3ca765 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -793,7 +793,7 @@ def aggregate_embedded_text_by_block( target_region: TextRegions, source_regions: TextRegions, subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, - embed_region_threshold: float = 0.25, + text_coverage_threshold: float = env_config.TEXT_COVERAGE_THRESHOLD, ) -> tuple[str, IsExtracted | None]: """Extracts the text aggregated from the 
elements of the given layout that lie within the given block.""" @@ -821,7 +821,7 @@ def aggregate_embedded_text_by_block( is_extracted = ( all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array) - and iou > embed_region_threshold + and iou > text_coverage_threshold ) else: # if nothing is sliced then it is not extracted diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 0e2daa714f..b4e4017b1a 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -234,5 +234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str: """The format for analysed pages with bboxes drawn on them. Default is 'png'.""" return self._get_string("ANALYSIS_BBOX_FORMAT", "png") + @property + def TEXT_COVERAGE_THRESHOLD(self) -> float: + """the minimum iou between extracted text bboxes and their target inferred element bbox for + the inferred element to be considered containing extracted text""" + return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25) + env_config = ENVConfig() From 23c1451c4fca6b73ab528945add43af94ee57daf Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 7 Jan 2026 12:48:57 -0600 Subject: [PATCH 5/6] use partial --- unstructured/partition/pdf_image/pdfminer_processing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 1f4c3ca765..9e5a3a9993 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -819,14 +819,15 @@ def aggregate_embedded_text_by_block( iou = _aggregated_iou(source_bboxes, target_bboxes[0, :]) - is_extracted = ( + fully_filled = ( all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array) and iou > text_coverage_threshold ) + is_extracted = IsExtracted.TRUE if fully_filled else IsExtracted.PARTIAL else: # 
if nothing is sliced then it is not extracted - is_extracted = False - return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE + is_extracted = IsExtracted.FALSE + return text, is_extracted def get_links_in_element(page_links: list, region: Rectangle) -> list: From caaad7f2a3b3740d56cacdd27bca560e6f5a1f92 Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 7 Jan 2026 15:11:26 -0600 Subject: [PATCH 6/6] fix: fix test --- .../pdf_image/test_pdfminer_processing.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 8934adb223..0c3603636e 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -188,6 +188,20 @@ def test_aggregate_only_partially_fill_target(): text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions) assert text == expected + assert extracted.value == "partial" + + +def test_aggregate_not_filling_target(): + embedded_regions = TextRegions.from_list( + [ + TextRegion.from_coords(300, 0, 400, 20, "outside"), + ] + ) + embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE]) + target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)]) + + text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions) + assert text == "" assert extracted.value == "false"