Unstructured-IO · codeflash-ai · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,18 +1,15 @@
-## 0.18.27-dev5
-
-### Enhancement
-- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation
-
-## 0.18.27-dev4
+## 0.18.27
 
 ### Fixes
 - Comment no-ops in `zoom_image` (codeflash)
+- Fix an issue where elements only partially covered by extracted text are marked as extracted
 
 ### Enhancement
 - Optimize `sentence_count` (codeflash)
 - Optimize `_PartitionerLoader._load_partitioner` (codeflash)
 - Optimize `detect_languages` (codeflash)
 - Optimize `contains_verb` (codeflash)
+- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation
 
 ## 0.18.26
 

diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -162,16 +162,33 @@ def test_aggregate_by_block():
     expected = "Inside region1 Inside region2"
     embedded_regions = TextRegions.from_list(
         [
-            TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
-            TextRegion.from_coords(20, 20, 80, 80, None),
-            TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
+            TextRegion.from_coords(0, 0, 300, 20, "Inside region1"),
+            TextRegion.from_coords(0, 20, 300, 80, None),
+            TextRegion.from_coords(0, 80, 200, 300, "Inside region2"),
             TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
         ]
     )
+    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4)
+    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])
+
+    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
+    assert text == expected
+    assert extracted.value == "true"
+
+
+def test_aggregate_only_partially_fill_target():
+    expected = "Inside region1"
+    embedded_regions = TextRegions.from_list(
+        [
+            TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
+        ]
+    )
+    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE])
     target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])
 
-    text, _ = aggregate_embedded_text_by_block(target_region, embedded_regions)
+    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
     assert text == expected
+    assert extracted.value == "false"
 
 
 @pytest.mark.parametrize(

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.18.27-dev5"  # pragma: no cover
+__version__ = "0.18.27"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -18,7 +18,7 @@
 from unstructured.partition.pdf_image.analysis.layout_dump import OCRLayoutDumper
 from unstructured.partition.pdf_image.pdf_image_utils import valid_text
 from unstructured.partition.pdf_image.pdfminer_processing import (
-    aggregate_embedded_text_by_block,
+    aggregate_embedded_text_batch,
     bboxes1_is_almost_subregion_of_bboxes2,
 )
 from unstructured.partition.utils.config import env_config
@@ -390,15 +390,20 @@ def merge_out_layout_with_ocr_layout(
         return out_layout
 
     invalid_text_indices = [i for i, text in enumerate(out_layout.texts) if not valid_text(text)]
-    out_layout.texts = out_layout.texts.astype(object)
 
-    for idx in invalid_text_indices:
-        out_layout.texts[idx], _ = aggregate_embedded_text_by_block(
-            target_region=out_layout.slice([idx]),
-            source_regions=ocr_layout,
-            threshold=subregion_threshold,
+    if invalid_text_indices:
+        out_layout.texts = out_layout.texts.astype(object)
+
+        aggregated_texts = aggregate_embedded_text_batch(
+            invalid_text_indices,
+            out_layout,
+            ocr_layout,
+            subregion_threshold=subregion_threshold,
         )
 
+        for idx, text in zip(invalid_text_indices, aggregated_texts):
+            out_layout.texts[idx] = text
+
     final_layout = (
         supplement_layout_with_ocr_elements(out_layout, ocr_layout)
         if supplement_with_ocr_elements

diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -259,9 +259,7 @@ def check_element_types_to_extract(
 
 def valid_text(text: str) -> bool:
     """a helper that determines if the text is valid ascii text"""
-    if not text:
-        return False
-    return "(cid:" not in text
+    return bool(text) and "(cid:" not in text  # bool() keeps the bool contract for ""/None
 
 
 def cid_ratio(text: str) -> float:

diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -774,10 +774,26 @@ def remove_duplicate_elements(
     return elements.slice(np.concatenate(ious))
 
 
+def _aggregated_iou(box1s, box2):
+    """Ratio of summed intersections to (summed areas - summed intersections) for box1s vs box2.
+
+    Each row of `box1s` is paired with `box2` independently and the terms are added up; returns
+    1.0 when the denominator is zero.
+    """
+    overlap = 0.0
+    total_area = calculate_bbox_area(box2)
+    for row in box1s:
+        overlap += calculate_intersection_area(row, box2)
+        total_area += calculate_bbox_area(row)
+    denominator = total_area - overlap
+    return 1.0 if denominator == 0 else overlap / denominator
+
+
 def aggregate_embedded_text_by_block(
     target_region: TextRegions,
     source_regions: TextRegions,
-    threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
+    subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
+    embed_region_threshold: float = 0.25,
 ) -> tuple[str, IsExtracted | None]:
     """Extracts the text aggregated from the elements of the given layout that lie within the given
     block."""
@@ -789,17 +805,27 @@ def aggregate_embedded_text_by_block(
         bboxes1_is_almost_subregion_of_bboxes2(
             source_regions.element_coords,
             target_region.element_coords,
-            threshold,
+            subregion_threshold,
         )
         .sum(axis=1)
         .astype(bool)
     )
 
     text = " ".join([text for text in source_regions.slice(mask).texts if text])
-    # if nothing is sliced then it is not extracted
-    is_extracted = sum(mask) and all(
-        flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array
-    )
+
+    if sum(mask):
+        source_bboxes = source_regions.slice(mask).element_coords
+        target_bboxes = target_region.element_coords
+
+        iou = _aggregated_iou(source_bboxes, target_bboxes[0, :])
+
+        is_extracted = (
+            all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array)
+            and iou > embed_region_threshold
+        )
+    else:
+        # if nothing is sliced then it is not extracted
+        is_extracted = False
     return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE
 
 
@@ -1144,3 +1170,37 @@ def try_argmin(array: np.ndarray) -> int:
         return int(np.argmin(array))
     except IndexError:
         return -1
+
+
+def aggregate_embedded_text_batch(
+    target_indices: list[int],
+    target_layout: "LayoutElements",
+    source_regions: TextRegions,
+    subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
+) -> list[str]:
+    """Aggregate source text for several target elements with one containment computation.
+
+    Returns one string per entry of `target_indices`, in the same order: the non-empty texts of
+    the source regions flagged by `bboxes1_is_almost_subregion_of_bboxes2` for that target,
+    joined by single spaces. With no targets, no sources, or no flagged source, the text is "".
+    """
+    if not target_indices or len(source_regions) == 0:
+        return [""] * len(target_indices)
+
+    # one containment matrix for every (source, target) pair; column j belongs to the j-th target
+    containment = bboxes1_is_almost_subregion_of_bboxes2(
+        source_regions.element_coords,
+        target_layout.element_coords[target_indices],
+        subregion_threshold,
+    )
+
+    aggregated = []
+    for j in range(len(target_indices)):
+        selected = containment[:, j].astype(bool)
+        if not selected.any():
+            aggregated.append("")
+            continue
+        kept = [chunk for chunk in source_regions.slice(selected).texts if chunk]
+        aggregated.append(" ".join(kept))
+
+    return aggregated
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -234,5 +234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str:
         """The format for analysed pages with bboxes drawn on them. Default is 'png'."""
         return self._get_string("ANALYSIS_BBOX_FORMAT", "png")
 
+    @property
+    def TEXT_COVERAGE_THRESHOLD(self) -> float:
+        """the minimum iou between extracted text bboxes and their target inferred element bbox for
+        the inferred element to be considered containing extracted text. Default is 0.25."""
+        return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25)
+
 
 env_config = ENVConfig()
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.18.27-dev5" # pragma: no cover
		__version__ = "0.18.27" # pragma: no cover