Unstructured-IO · badGarnet · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.18.27-dev6
+## 0.18.27
 
 ### Fixes
 - Comment no-ops in `zoom_image` (codeflash)
+- Fix an issue where elements only partially covered by extracted text were marked as fully extracted
 
 ### Enhancement
 - Optimize `sentence_count` (codeflash)

diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -162,16 +162,47 @@ def test_aggregate_by_block():
     expected = "Inside region1 Inside region2"
     embedded_regions = TextRegions.from_list(
         [
-            TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
-            TextRegion.from_coords(20, 20, 80, 80, None),
-            TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
+            TextRegion.from_coords(0, 0, 300, 20, "Inside region1"),
+            TextRegion.from_coords(0, 20, 300, 80, None),
+            TextRegion.from_coords(0, 80, 200, 300, "Inside region2"),
             TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
         ]
     )
+    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4)
     target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])
 
-    text, _ = aggregate_embedded_text_by_block(target_region, embedded_regions)
+    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
     assert text == expected
+    assert extracted.value == "true"
+
+
+def test_aggregate_only_partially_fill_target():
+    expected = "Inside region1"
+    embedded_regions = TextRegions.from_list(
+        [
+            TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
+        ]
+    )
+    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE])
+    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])
+
+    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
+    assert text == expected
+    assert extracted.value == "partial"
+
+
+def test_aggregate_not_filling_target():
+    embedded_regions = TextRegions.from_list(
+        [
+            TextRegion.from_coords(300, 0, 400, 20, "outside"),
+        ]
+    )
+    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE])
+    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])
+
+    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
+    assert text == ""
+    assert extracted.value == "false"
 
 
 @pytest.mark.parametrize(

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.18.27-dev6"  # pragma: no cover
+__version__ = "0.18.27"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -396,7 +396,7 @@ def merge_out_layout_with_ocr_layout(
         out_layout.texts[idx], _ = aggregate_embedded_text_by_block(
             target_region=out_layout.slice([idx]),
             source_regions=ocr_layout,
-            threshold=subregion_threshold,
+            subregion_threshold=subregion_threshold,
         )
 
     final_layout = (

diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -774,10 +774,26 @@ def remove_duplicate_elements(
     return elements.slice(np.concatenate(ious))
 
 
+def _aggregated_iou(box1s, box2):
+    intersection = 0.0
+    sum_areas = calculate_bbox_area(box2)
+
+    for i in range(box1s.shape[0]):
+        intersection += calculate_intersection_area(box1s[i, :], box2)
+        sum_areas += calculate_bbox_area(box1s[i, :])
+
+    union = sum_areas - intersection
+
+    if union == 0:
+        return 1.0
+    return intersection / union
+
+
 def aggregate_embedded_text_by_block(
     target_region: TextRegions,
     source_regions: TextRegions,
-    threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
+    subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
+    text_coverage_threshold: float = env_config.TEXT_COVERAGE_THRESHOLD,
 ) -> tuple[str, IsExtracted | None]:
     """Extracts the text aggregated from the elements of the given layout that lie within the given
     block."""
@@ -789,18 +805,29 @@ def aggregate_embedded_text_by_block(
         bboxes1_is_almost_subregion_of_bboxes2(
             source_regions.element_coords,
             target_region.element_coords,
-            threshold,
+            subregion_threshold,
         )
         .sum(axis=1)
         .astype(bool)
     )
 
     text = " ".join([text for text in source_regions.slice(mask).texts if text])
-    # if nothing is sliced then it is not extracted
-    is_extracted = sum(mask) and all(
-        flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array
-    )
-    return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE
+
+    if sum(mask):
+        source_bboxes = source_regions.slice(mask).element_coords
+        target_bboxes = target_region.element_coords
+
+        iou = _aggregated_iou(source_bboxes, target_bboxes[0, :])
+
+        fully_filled = (
+            all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array)
+            and iou > text_coverage_threshold
+        )
+        is_extracted = IsExtracted.TRUE if fully_filled else IsExtracted.PARTIAL
+    else:
+        # if nothing is sliced then it is not extracted
+        is_extracted = IsExtracted.FALSE
+    return text, is_extracted
 
 
 def get_links_in_element(page_links: list, region: Rectangle) -> list:

diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -234,5 +234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str:
         """The format for analysed pages with bboxes drawn on them. Default is 'png'."""
         return self._get_string("ANALYSIS_BBOX_FORMAT", "png")
 
+    @property
+    def TEXT_COVERAGE_THRESHOLD(self) -> float:
+        """the minimum iou between extracted text bboxes and their target inferred element bbox for
+        the inferred element to be considered as containing extracted text"""
+        return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25)
+
 
 env_config = ENVConfig()
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.18.27-dev6" # pragma: no cover
		__version__ = "0.18.27" # pragma: no cover