diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c88a317e8..9d94f26bc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.18.27-dev6 +## 0.18.27 ### Fixes - Comment no-ops in `zoom_image` (codeflash) +- Fix an issue where elements with partially filled extracted text are marked as extracted ### Enhancement - Optimize `sentence_count` (codeflash) diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index e0fb02f23f..0c3603636e 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -162,16 +162,47 @@ def test_aggregate_by_block(): expected = "Inside region1 Inside region2" embedded_regions = TextRegions.from_list( [ - TextRegion.from_coords(0, 0, 20, 20, "Inside region1"), - TextRegion.from_coords(20, 20, 80, 80, None), - TextRegion.from_coords(50, 50, 150, 150, "Inside region2"), + TextRegion.from_coords(0, 0, 300, 20, "Inside region1"), + TextRegion.from_coords(0, 20, 300, 80, None), + TextRegion.from_coords(0, 80, 200, 300, "Inside region2"), TextRegion.from_coords(250, 250, 350, 350, "Outside region"), ] ) + embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4) target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)]) - text, _ = aggregate_embedded_text_by_block(target_region, embedded_regions) + text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions) assert text == expected + assert extracted.value == "true" + + +def test_aggregate_only_partially_fill_target(): + expected = "Inside region1" + embedded_regions = TextRegions.from_list( + [ + TextRegion.from_coords(0, 0, 20, 20, "Inside region1"), + ] + ) + embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE]) + target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)]) + + text, extracted = 
aggregate_embedded_text_by_block(target_region, embedded_regions) + assert text == expected + assert extracted.value == "partial" + + +def test_aggregate_not_filling_target(): + embedded_regions = TextRegions.from_list( + [ + TextRegion.from_coords(300, 0, 400, 20, "outside"), + ] + ) + embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE]) + target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)]) + + text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions) + assert text == "" + assert extracted.value == "false" @pytest.mark.parametrize( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c0da63dd46..4bb2b92ac3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.27-dev6" # pragma: no cover +__version__ = "0.18.27" # pragma: no cover diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 852e2f94e4..f0660ac9dd 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -396,7 +396,7 @@ def merge_out_layout_with_ocr_layout( out_layout.texts[idx], _ = aggregate_embedded_text_by_block( target_region=out_layout.slice([idx]), source_regions=ocr_layout, - threshold=subregion_threshold, + subregion_threshold=subregion_threshold, ) final_layout = ( diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 0c634c32ea..9e5a3a9993 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -774,10 +774,26 @@ def remove_duplicate_elements( return elements.slice(np.concatenate(ious)) +def _aggregated_iou(box1s, box2): + intersection = 0.0 + sum_areas = calculate_bbox_area(box2) + + for i in range(box1s.shape[0]): + intersection += calculate_intersection_area(box1s[i, :], box2) + sum_areas += 
calculate_bbox_area(box1s[i, :]) + + union = sum_areas - intersection + + if union == 0: + return 1.0 + return intersection / union + + def aggregate_embedded_text_by_block( target_region: TextRegions, source_regions: TextRegions, - threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, + subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, + text_coverage_threshold: float = env_config.TEXT_COVERAGE_THRESHOLD, ) -> tuple[str, IsExtracted | None]: """Extracts the text aggregated from the elements of the given layout that lie within the given block.""" @@ -789,18 +805,29 @@ def aggregate_embedded_text_by_block( bboxes1_is_almost_subregion_of_bboxes2( source_regions.element_coords, target_region.element_coords, - threshold, + subregion_threshold, ) .sum(axis=1) .astype(bool) ) text = " ".join([text for text in source_regions.slice(mask).texts if text]) - # if nothing is sliced then it is not extracted - is_extracted = sum(mask) and all( - flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array - ) - return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE + + if sum(mask): + source_bboxes = source_regions.slice(mask).element_coords + target_bboxes = target_region.element_coords + + iou = _aggregated_iou(source_bboxes, target_bboxes[0, :]) + + fully_filled = ( + all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array) + and iou > text_coverage_threshold + ) + is_extracted = IsExtracted.TRUE if fully_filled else IsExtracted.PARTIAL + else: + # if nothing is sliced then it is not extracted + is_extracted = IsExtracted.FALSE + return text, is_extracted def get_links_in_element(page_links: list, region: Rectangle) -> list: diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 0e2daa714f..b4e4017b1a 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -234,5 
+234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str: """The format for analysed pages with bboxes drawn on them. Default is 'png'.""" return self._get_string("ANALYSIS_BBOX_FORMAT", "png") + @property + def TEXT_COVERAGE_THRESHOLD(self) -> float: + """the minimum iou between extracted text bboxes and their target inferred element bbox for + the inferred element to be considered containing extracted text""" + return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25) + env_config = ENVConfig()