Skip to content
Closed
9 changes: 3 additions & 6 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
## 0.18.27-dev5

### Enhancement
- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation

## 0.18.27-dev4
## 0.18.27

### Fixes
- Comment no-ops in `zoom_image` (codeflash)
- Fix an issue where elements with partially filled extracted text are marked as extracted

### Enhancement
- Optimize `sentence_count` (codeflash)
- Optimize `_PartitionerLoader._load_partitioner` (codeflash)
- Optimize `detect_languages` (codeflash)
- Optimize `contains_verb` (codeflash)
- Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation

## 0.18.26

Expand Down
25 changes: 21 additions & 4 deletions test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,33 @@ def test_aggregate_by_block():
expected = "Inside region1 Inside region2"
embedded_regions = TextRegions.from_list(
[
TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
TextRegion.from_coords(20, 20, 80, 80, None),
TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
TextRegion.from_coords(0, 0, 300, 20, "Inside region1"),
TextRegion.from_coords(0, 20, 300, 80, None),
TextRegion.from_coords(0, 80, 200, 300, "Inside region2"),
TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
]
)
embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4)
target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
assert text == expected
assert extracted.value == "true"


def test_aggregate_only_partially_fill_target():
expected = "Inside region1"
embedded_regions = TextRegions.from_list(
[
TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
]
)
embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE])
target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

text, _ = aggregate_embedded_text_by_block(target_region, embedded_regions)
text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
assert text == expected
assert extracted.value == "false"


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.18.27-dev5" # pragma: no cover
__version__ = "0.18.27" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def merge_out_layout_with_ocr_layout(
out_layout.texts[idx], _ = aggregate_embedded_text_by_block(
target_region=out_layout.slice([idx]),
source_regions=ocr_layout,
threshold=subregion_threshold,
subregion_threshold=subregion_threshold,
)

final_layout = (
Expand Down
41 changes: 34 additions & 7 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,10 +774,26 @@ def remove_duplicate_elements(
return elements.slice(np.concatenate(ious))


def _aggregated_iou(box1s, box2):
    """Return an IoU-style coverage ratio between a group of boxes and one reference box.

    Each row of `box1s` is compared against `box2` independently: its intersection area with
    `box2` and its own area are added to running totals. Overlap among the rows of `box1s`
    themselves is not deduplicated, so the result is an aggregate score rather than a strict
    geometric IoU.

    Args:
        box1s: 2-D array-like of boxes, one box per row.
        box2: the single reference box each row is compared against.

    Returns:
        The summed intersection divided by (summed areas - summed intersection), or 1.0 when
        that denominator is zero.
    """
    overlap_total = 0.0
    area_total = calculate_bbox_area(box2)

    for candidate in box1s:
        overlap_total += calculate_intersection_area(candidate, box2)
        area_total += calculate_bbox_area(candidate)

    combined = area_total - overlap_total

    # A zero denominator means there is no area to compare; report full agreement rather than
    # dividing by zero.
    return 1.0 if combined == 0 else overlap_total / combined


def aggregate_embedded_text_by_block(
target_region: TextRegions,
source_regions: TextRegions,
threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
embed_region_threshold: float = 0.25,
) -> tuple[str, IsExtracted | None]:
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""
Expand All @@ -789,17 +805,28 @@ def aggregate_embedded_text_by_block(
bboxes1_is_almost_subregion_of_bboxes2(
source_regions.element_coords,
target_region.element_coords,
threshold,
subregion_threshold,
)
.sum(axis=1)
.astype(bool)
)

text = " ".join([text for text in source_regions.slice(mask).texts if text])
# if nothing is sliced then it is not extracted
is_extracted = sum(mask) and all(
flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array
)
masked_source = source_regions.slice(mask)
text = " ".join([text for text in masked_source.texts if text])

if sum(mask):
source_bboxes = masked_source.element_coords
target_bboxes = target_region.element_coords

iou = _aggregated_iou(source_bboxes, target_bboxes[0, :])

is_extracted = (
all(flag == IsExtracted.TRUE for flag in masked_source.is_extracted_array)
and iou > embed_region_threshold
)
else:
# if nothing is sliced then it is not extracted
is_extracted = False
return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE


Expand Down
6 changes: 6 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,5 +234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str:
"""The format for analysed pages with bboxes drawn on them. Default is 'png'."""
return self._get_string("ANALYSIS_BBOX_FORMAT", "png")

@property
def TEXT_COVERAGE_THRESHOLD(self) -> float:
"""the minimum iou between extracted text bboxes and their target inferred element bbox for
the inferred element to be considered as containing extracted text"""
return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25)


env_config = ENVConfig()