Unstructured-IO · codeflash-ai · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,18 +1,15 @@
-## 0.18.27-dev5
-
-### Enhancement
-- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation
-
-## 0.18.27-dev4
+## 0.18.27
 
 ### Fixes
 - Comment no-ops in `zoom_image` (codeflash)
+- Fix an issue where elements with partially filled extracted text are marked as extracted
 
 ### Enhancement
 - Optimize `sentence_count` (codeflash)
 - Optimize `_PartitionerLoader._load_partitioner` (codeflash)
 - Optimize `detect_languages` (codeflash)
 - Optimize `contains_verb` (codeflash)
+- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation
 
 ## 0.18.26
 

diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -162,16 +162,33 @@ def test_aggregate_by_block():
     expected = "Inside region1 Inside region2"
     embedded_regions = TextRegions.from_list(
         [
-            TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
-            TextRegion.from_coords(20, 20, 80, 80, None),
-            TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
+            TextRegion.from_coords(0, 0, 300, 20, "Inside region1"),
+            TextRegion.from_coords(0, 20, 300, 80, None),
+            TextRegion.from_coords(0, 80, 200, 300, "Inside region2"),
             TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
         ]
     )
+    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4)
+    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])
+
+    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
+    assert text == expected
+    assert extracted.value == "true"
+
+
+def test_aggregate_only_partially_fill_target():
+    expected = "Inside region1"
+    embedded_regions = TextRegions.from_list(
+        [
+            TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
+        ]
+    )
+    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE])
     target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])
 
-    text, _ = aggregate_embedded_text_by_block(target_region, embedded_regions)
+    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
     assert text == expected
+    assert extracted.value == "false"
 
 
 @pytest.mark.parametrize(

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.18.27-dev5"  # pragma: no cover
+__version__ = "0.18.27"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -396,7 +396,7 @@ def merge_out_layout_with_ocr_layout(
         out_layout.texts[idx], _ = aggregate_embedded_text_by_block(
             target_region=out_layout.slice([idx]),
             source_regions=ocr_layout,
-            threshold=subregion_threshold,
+            subregion_threshold=subregion_threshold,
         )
 
     final_layout = (

diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -774,10 +774,26 @@ def remove_duplicate_elements(
     return elements.slice(np.concatenate(ious))
 
 
+def _aggregated_iou(box1s, box2):
+    intersection = 0.0
+    sum_areas = calculate_bbox_area(box2)
+
+    for i in range(box1s.shape[0]):
+        intersection += calculate_intersection_area(box1s[i, :], box2)
+        sum_areas += calculate_bbox_area(box1s[i, :])
+
+    union = sum_areas - intersection
+
+    if union == 0:
+        return 1.0
+    return intersection / union
+
+
 def aggregate_embedded_text_by_block(
     target_region: TextRegions,
     source_regions: TextRegions,
-    threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
+    subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
+    embed_region_threshold: float = 0.25,
 ) -> tuple[str, IsExtracted | None]:
     """Extracts the text aggregated from the elements of the given layout that lie within the given
     block."""
@@ -789,17 +805,28 @@ def aggregate_embedded_text_by_block(
         bboxes1_is_almost_subregion_of_bboxes2(
             source_regions.element_coords,
             target_region.element_coords,
-            threshold,
+            subregion_threshold,
         )
         .sum(axis=1)
         .astype(bool)
     )
 
-    text = " ".join([text for text in source_regions.slice(mask).texts if text])
-    # if nothing is sliced then it is not extracted
-    is_extracted = sum(mask) and all(
-        flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array
-    )
+    masked_source = source_regions.slice(mask)
+    text = " ".join([text for text in masked_source.texts if text])
+
+    if sum(mask):
+        source_bboxes = masked_source.element_coords
+        target_bboxes = target_region.element_coords
+
+        iou = _aggregated_iou(source_bboxes, target_bboxes[0, :])
+
+        is_extracted = (
+            all(flag == IsExtracted.TRUE for flag in masked_source.is_extracted_array)
+            and iou > embed_region_threshold
+        )
+    else:
+        # if nothing is sliced then it is not extracted
+        is_extracted = False
     return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE
 
 

diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -234,5 +234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str:
         """The format for analysed pages with bboxes drawn on them. Default is 'png'."""
         return self._get_string("ANALYSIS_BBOX_FORMAT", "png")
 
+    @property
+    def TEXT_COVERAGE_THRESHOLD(self) -> float:
+        """the minimum iou between extracted text bboxes and their target inferred element bbox for
+        the inferred element to be considered contaning extracted text"""
+        return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25)
+
 
 env_config = ENVConfig()
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.18.27-dev5" # pragma: no cover
		__version__ = "0.18.27" # pragma: no cover