diff --git a/CHANGELOG.md b/CHANGELOG.md index aae973d8ed..3477587066 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,18 +1,15 @@ -## 0.18.27-dev5 - -### Enhancement -- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation - -## 0.18.27-dev4 +## 0.18.27 ### Fixes - Comment no-ops in `zoom_image` (codeflash) +- Fix an issue where elements with partially filled extracted text are marked as extracted ### Enhancement - Optimize `sentence_count` (codeflash) - Optimize `_PartitionerLoader._load_partitioner` (codeflash) - Optimize `detect_languages` (codeflash) - Optimize `contains_verb` (codeflash) +- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation ## 0.18.26 diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index e0fb02f23f..8934adb223 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -162,16 +162,33 @@ def test_aggregate_by_block(): expected = "Inside region1 Inside region2" embedded_regions = TextRegions.from_list( [ - TextRegion.from_coords(0, 0, 20, 20, "Inside region1"), - TextRegion.from_coords(20, 20, 80, 80, None), - TextRegion.from_coords(50, 50, 150, 150, "Inside region2"), + TextRegion.from_coords(0, 0, 300, 20, "Inside region1"), + TextRegion.from_coords(0, 20, 300, 80, None), + TextRegion.from_coords(0, 80, 200, 300, "Inside region2"), TextRegion.from_coords(250, 250, 350, 350, "Outside region"), ] ) + embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4) + target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)]) + + text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions) + assert text == expected + assert extracted.value == "true" + + +def test_aggregate_only_partially_fill_target(): + 
expected = "Inside region1" + embedded_regions = TextRegions.from_list( + [ + TextRegion.from_coords(0, 0, 20, 20, "Inside region1"), + ] + ) + embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE]) target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)]) - text, _ = aggregate_embedded_text_by_block(target_region, embedded_regions) + text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions) assert text == expected + assert extracted.value == "false" @pytest.mark.parametrize( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c72d9a6020..4bb2b92ac3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.27-dev5" # pragma: no cover +__version__ = "0.18.27" # pragma: no cover diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 852e2f94e4..62bf4b6f39 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -18,7 +18,7 @@ from unstructured.partition.pdf_image.analysis.layout_dump import OCRLayoutDumper from unstructured.partition.pdf_image.pdf_image_utils import valid_text from unstructured.partition.pdf_image.pdfminer_processing import ( - aggregate_embedded_text_by_block, + aggregate_embedded_text_batch, bboxes1_is_almost_subregion_of_bboxes2, ) from unstructured.partition.utils.config import env_config @@ -390,15 +390,20 @@ def merge_out_layout_with_ocr_layout( return out_layout invalid_text_indices = [i for i, text in enumerate(out_layout.texts) if not valid_text(text)] - out_layout.texts = out_layout.texts.astype(object) - for idx in invalid_text_indices: - out_layout.texts[idx], _ = aggregate_embedded_text_by_block( - target_region=out_layout.slice([idx]), - source_regions=ocr_layout, - threshold=subregion_threshold, + if invalid_text_indices: + out_layout.texts = out_layout.texts.astype(object) + + aggregated_texts = 
aggregate_embedded_text_batch( + invalid_text_indices, + out_layout, + ocr_layout, + subregion_threshold=subregion_threshold, ) + for idx, text in zip(invalid_text_indices, aggregated_texts): + out_layout.texts[idx] = text + final_layout = ( supplement_layout_with_ocr_elements(out_layout, ocr_layout) if supplement_with_ocr_elements diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index 4365b8dba5..2124ffbd91 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -259,9 +259,7 @@ def check_element_types_to_extract( def valid_text(text: str) -> bool: """a helper that determines if the text is valid ascii text""" - if not text: - return False - return "(cid:" not in text + return bool(text) and "(cid:" not in text def cid_ratio(text: str) -> float: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 0c634c32ea..ac9528f1b3 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -774,10 +774,26 @@ def remove_duplicate_elements( return elements.slice(np.concatenate(ious)) +def _aggregated_iou(box1s, box2): + intersection = 0.0 + sum_areas = calculate_bbox_area(box2) + + for i in range(box1s.shape[0]): + intersection += calculate_intersection_area(box1s[i, :], box2) + sum_areas += calculate_bbox_area(box1s[i, :]) + + union = sum_areas - intersection + + if union == 0: + return 1.0 + return intersection / union + + def aggregate_embedded_text_by_block( target_region: TextRegions, source_regions: TextRegions, - threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, + subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, + embed_region_threshold: float = env_config.TEXT_COVERAGE_THRESHOLD, ) -> tuple[str, IsExtracted | None]: """Extracts the text aggregated from 
the elements of the given layout that lie within the given block.""" @@ -789,17 +805,27 @@ def aggregate_embedded_text_by_block( bboxes1_is_almost_subregion_of_bboxes2( source_regions.element_coords, target_region.element_coords, - threshold, + subregion_threshold, ) .sum(axis=1) .astype(bool) ) text = " ".join([text for text in source_regions.slice(mask).texts if text]) - # if nothing is sliced then it is not extracted - is_extracted = sum(mask) and all( - flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array - ) + + if sum(mask): + source_bboxes = source_regions.slice(mask).element_coords + target_bboxes = target_region.element_coords + + iou = _aggregated_iou(source_bboxes, target_bboxes[0, :]) + + is_extracted = ( + all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array) + and iou > embed_region_threshold + ) + else: + # if nothing is sliced then it is not extracted + is_extracted = False return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE @@ -1144,3 +1170,37 @@ def try_argmin(array: np.ndarray) -> int: return int(np.argmin(array)) except IndexError: return -1 + + +def aggregate_embedded_text_batch( + target_indices: list[int], + target_layout: "LayoutElements", + source_regions: TextRegions, + subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, +) -> list[str]: + """Batch process multiple target regions to extract aggregated text efficiently.""" + if not target_indices or len(source_regions) == 0: + return [""] * len(target_indices) + + # Get all target regions at once + target_coords = target_layout.element_coords[target_indices] + + # Compute masks for all targets in one operation + # Result shape: (len(source_regions), len(target_indices)) + all_masks = bboxes1_is_almost_subregion_of_bboxes2( + source_regions.element_coords, + target_coords, + subregion_threshold, + ) + + # Extract texts for each target + texts = [] + for i in 
range(len(target_indices)): + mask = all_masks[:, i].astype(bool) + if mask.any(): + text = " ".join([text for text in source_regions.slice(mask).texts if text]) + else: + text = "" + texts.append(text) + + return texts diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 0e2daa714f..b4e4017b1a 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -234,5 +234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str: """The format for analysed pages with bboxes drawn on them. Default is 'png'.""" return self._get_string("ANALYSIS_BBOX_FORMAT", "png") + @property + def TEXT_COVERAGE_THRESHOLD(self) -> float: + """the minimum iou between extracted text bboxes and their target inferred element bbox for + the inferred element to be considered containing extracted text""" + return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25) + env_config = ENVConfig()