Skip to content
Closed
9 changes: 3 additions & 6 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
## 0.18.27-dev5

### Enhancement
- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation

## 0.18.27-dev4
## 0.18.27

### Fixes
- Comment no-ops in `zoom_image` (codeflash)
- Fix an issue where elements with partially filled extracted text are marked as extracted

### Enhancement
- Optimize `sentence_count` (codeflash)
- Optimize `_PartitionerLoader._load_partitioner` (codeflash)
- Optimize `detect_languages` (codeflash)
- Optimize `contains_verb` (codeflash)
- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation

## 0.18.26

Expand Down
25 changes: 21 additions & 4 deletions test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,33 @@ def test_aggregate_by_block():
expected = "Inside region1 Inside region2"
embedded_regions = TextRegions.from_list(
[
TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
TextRegion.from_coords(20, 20, 80, 80, None),
TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
TextRegion.from_coords(0, 0, 300, 20, "Inside region1"),
TextRegion.from_coords(0, 20, 300, 80, None),
TextRegion.from_coords(0, 80, 200, 300, "Inside region2"),
TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
]
)
embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4)
target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
assert text == expected
assert extracted.value == "true"


def test_aggregate_only_partially_fill_target():
    """Text from a tiny source region is returned, but the target is not flagged as extracted.

    The single 20x20 source box lies inside the 300x300 target, so its text is aggregated; it
    covers only a small corner of the target, so the extracted flag is expected to be "false".
    """
    expected = "Inside region1"
    embedded_regions = TextRegions.from_list(
        [
            TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
        ]
    )
    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE])
    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

    # Aggregate once and check both the text and the extraction flag from the same call.
    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
    assert text == expected
    assert extracted.value == "false"


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.18.27-dev5" # pragma: no cover
__version__ = "0.18.27" # pragma: no cover
19 changes: 12 additions & 7 deletions unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from unstructured.partition.pdf_image.analysis.layout_dump import OCRLayoutDumper
from unstructured.partition.pdf_image.pdf_image_utils import valid_text
from unstructured.partition.pdf_image.pdfminer_processing import (
aggregate_embedded_text_by_block,
aggregate_embedded_text_batch,
bboxes1_is_almost_subregion_of_bboxes2,
)
from unstructured.partition.utils.config import env_config
Expand Down Expand Up @@ -390,15 +390,20 @@ def merge_out_layout_with_ocr_layout(
return out_layout

invalid_text_indices = [i for i, text in enumerate(out_layout.texts) if not valid_text(text)]
out_layout.texts = out_layout.texts.astype(object)

for idx in invalid_text_indices:
out_layout.texts[idx], _ = aggregate_embedded_text_by_block(
target_region=out_layout.slice([idx]),
source_regions=ocr_layout,
threshold=subregion_threshold,
if invalid_text_indices:
out_layout.texts = out_layout.texts.astype(object)

aggregated_texts = aggregate_embedded_text_batch(
invalid_text_indices,
out_layout,
ocr_layout,
subregion_threshold=subregion_threshold,
)

for idx, text in zip(invalid_text_indices, aggregated_texts):
out_layout.texts[idx] = text

final_layout = (
supplement_layout_with_ocr_elements(out_layout, ocr_layout)
if supplement_with_ocr_elements
Expand Down
4 changes: 1 addition & 3 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,7 @@ def check_element_types_to_extract(

def valid_text(text: str) -> bool:
    """Return True when `text` is non-empty and does not contain a "(cid:" marker.

    Falsy input (empty string or None) yields False. The result is always a real `bool`;
    a bare `text and ...` would hand back the falsy operand itself instead of False.
    """
    return bool(text) and "(cid:" not in text


def cid_ratio(text: str) -> float:
Expand Down
72 changes: 66 additions & 6 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,10 +774,26 @@ def remove_duplicate_elements(
return elements.slice(np.concatenate(ious))


def _aggregated_iou(box1s, box2):
    """Return an IoU-style ratio between a group of boxes and one reference box.

    `box1s` is indexed as `box1s[i, :]`, one box per row; `box2` is the single reference box.
    The numerator is the summed intersection area of every row with `box2`. The denominator is
    the area of `box2` plus the area of every row, minus that summed intersection. Overlap
    between the rows of `box1s` themselves is not deducted.

    Returns 1.0 when the denominator is zero.
    """
    overlap_total = 0.0
    area_total = calculate_bbox_area(box2)

    for row in range(box1s.shape[0]):
        candidate = box1s[row, :]
        overlap_total += calculate_intersection_area(candidate, box2)
        area_total += calculate_bbox_area(candidate)

    denominator = area_total - overlap_total
    # A zero denominator is reported as full overlap instead of dividing by zero.
    return 1.0 if denominator == 0 else overlap_total / denominator


def aggregate_embedded_text_by_block(
target_region: TextRegions,
source_regions: TextRegions,
threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
embed_region_threshold: float = 0.25,
) -> tuple[str, IsExtracted | None]:
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""
Expand All @@ -789,17 +805,27 @@ def aggregate_embedded_text_by_block(
bboxes1_is_almost_subregion_of_bboxes2(
source_regions.element_coords,
target_region.element_coords,
threshold,
subregion_threshold,
)
.sum(axis=1)
.astype(bool)
)

text = " ".join([text for text in source_regions.slice(mask).texts if text])
# if nothing is sliced then it is not extracted
is_extracted = sum(mask) and all(
flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array
)

if sum(mask):
source_bboxes = source_regions.slice(mask).element_coords
target_bboxes = target_region.element_coords

iou = _aggregated_iou(source_bboxes, target_bboxes[0, :])

is_extracted = (
all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array)
and iou > embed_region_threshold
)
else:
# if nothing is sliced then it is not extracted
is_extracted = False
return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE


Expand Down Expand Up @@ -1144,3 +1170,37 @@ def try_argmin(array: np.ndarray) -> int:
return int(np.argmin(array))
except IndexError:
return -1


def aggregate_embedded_text_batch(
    target_indices: list[int],
    target_layout: "LayoutElements",
    source_regions: TextRegions,
    subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
) -> list[str]:
    """Aggregate source-region text for several target elements in one pass.

    For each index in `target_indices`, the texts of all `source_regions` that
    `bboxes1_is_almost_subregion_of_bboxes2` marks as lying within that target's bbox are
    joined with single spaces. Empty or missing source texts are skipped.

    Returns one string per entry of `target_indices`, in the same order. A target with no
    matching source region gets "". When there are no targets or no source regions, every
    entry is "".
    """
    if not target_indices or len(source_regions) == 0:
        return [""] * len(target_indices)

    # One subregion test for every (source, target) pair; column j belongs to
    # target_indices[j] and each row to one source region.
    coverage = bboxes1_is_almost_subregion_of_bboxes2(
        source_regions.element_coords,
        target_layout.element_coords[target_indices],
        subregion_threshold,
    )

    aggregated = []
    for column in range(len(target_indices)):
        selected = coverage[:, column].astype(bool)
        if not selected.any():
            aggregated.append("")
            continue
        pieces = [piece for piece in source_regions.slice(selected).texts if piece]
        aggregated.append(" ".join(pieces))

    return aggregated
6 changes: 6 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,5 +234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str:
"""The format for analysed pages with bboxes drawn on them. Default is 'png'."""
return self._get_string("ANALYSIS_BBOX_FORMAT", "png")

@property
def TEXT_COVERAGE_THRESHOLD(self) -> float:
    """The minimum IoU between extracted text bboxes and their target inferred element bbox for
    the inferred element to be considered as containing extracted text. Default is 0.25."""
    return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25)


env_config = ENVConfig()