Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## 0.18.27-dev6
## 0.18.27

### Fixes
- Comment no-ops in `zoom_image` (codeflash)
- Fix an issue where elements with partially filled extracted text are marked as extracted

### Enhancement
- Optimize `sentence_count` (codeflash)
Expand Down
39 changes: 35 additions & 4 deletions test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,47 @@ def test_aggregate_by_block():
expected = "Inside region1 Inside region2"
embedded_regions = TextRegions.from_list(
[
TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
TextRegion.from_coords(20, 20, 80, 80, None),
TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
TextRegion.from_coords(0, 0, 300, 20, "Inside region1"),
TextRegion.from_coords(0, 20, 300, 80, None),
TextRegion.from_coords(0, 80, 200, 300, "Inside region2"),
TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
]
)
embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE] * 4)
target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

text, _ = aggregate_embedded_text_by_block(target_region, embedded_regions)
text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)
assert text == expected
assert extracted.value == "true"


def test_aggregate_only_partially_fill_target():
    """One small embedded region inside a much larger target: text is kept, status is "partial"."""
    small_region = TextRegion.from_coords(0, 0, 20, 20, "Inside region1")
    embedded_regions = TextRegions.from_list([small_region])
    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE])
    # The target spans (0, 0)-(300, 300), far larger than the 20x20 embedded region.
    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)

    assert text == "Inside region1"
    assert extracted.value == "partial"


def test_aggregate_not_filling_target():
    """An embedded region that starts at the target's right edge yields no text and "false"."""
    outside_region = TextRegion.from_coords(300, 0, 400, 20, "outside")
    embedded_regions = TextRegions.from_list([outside_region])
    embedded_regions.is_extracted_array = np.array([IsExtracted.TRUE])
    # The target spans x in [0, 300]; the embedded region spans x in [300, 400].
    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

    text, extracted = aggregate_embedded_text_by_block(target_region, embedded_regions)

    assert text == ""
    assert extracted.value == "false"


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.18.27-dev6" # pragma: no cover
__version__ = "0.18.27" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def merge_out_layout_with_ocr_layout(
out_layout.texts[idx], _ = aggregate_embedded_text_by_block(
target_region=out_layout.slice([idx]),
source_regions=ocr_layout,
threshold=subregion_threshold,
subregion_threshold=subregion_threshold,
)

final_layout = (
Expand Down
41 changes: 34 additions & 7 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,10 +774,26 @@ def remove_duplicate_elements(
return elements.slice(np.concatenate(ious))


def _aggregated_iou(box1s, box2):
intersection = 0.0
sum_areas = calculate_bbox_area(box2)

for i in range(box1s.shape[0]):
intersection += calculate_intersection_area(box1s[i, :], box2)
sum_areas += calculate_bbox_area(box1s[i, :])

Comment on lines +778 to +784
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚡️Codeflash found 129% (1.29x) speedup for _aggregated_iou in unstructured/partition/pdf_image/pdfminer_processing.py

⏱️ Runtime: 4.05 milliseconds → 1.77 milliseconds (best of 15 runs)

📝 Explanation and details

The optimized code achieves a 129% speedup (4.05ms → 1.77ms) by eliminating expensive function call overhead within a tight loop. The key optimizations are:

What Changed

  1. Inlined function calls: Instead of calling calculate_bbox_area() and calculate_intersection_area() repeatedly in the loop (1,032+ times per execution), all area and intersection calculations are performed inline using direct arithmetic operations.

  2. Hoisted box2 unpacking: The box2 tuple is unpacked once before the loop instead of being unpacked on every iteration inside calculate_intersection_area().

  3. Direct array indexing: Changed from slice notation box1s[i, :] to individual element access box1s[i, 0], box1s[i, 1], etc., which avoids creating intermediate array slices.

Why It's Faster

Function call overhead dominates: The line profiler shows that in the original code, 65.2% of time was spent in calculate_intersection_area() calls and 30.2% in calculate_bbox_area() calls. In Python, function calls are expensive due to:

  • Stack frame creation/destruction
  • Argument passing and tuple unpacking
  • Name resolution in function scope

By inlining these operations, the optimized version spends only 5.3% of time on intersection area calculation (now a simple inline multiplication) and eliminates the separate calculate_bbox_area() calls entirely.

Reduced tuple operations: The original code unpacked bbox2 1,032 times inside calculate_intersection_area(). The optimized version does this once, saving thousands of tuple unpacking operations.

Impact on Workloads

Based on the function_references, _aggregated_iou() is called from aggregate_embedded_text_by_block() during PDF text extraction - once per target region to compute IoU between embedded text bounding boxes and layout blocks. Given that:

  • PDFs commonly have dozens to hundreds of layout blocks
  • Each call processes all overlapping text regions (the box1s parameter)
  • This happens during document parsing, a core operation

The optimization provides meaningful speedup in document-heavy workloads. The annotated tests confirm the optimization scales well:

  • Small inputs (1-3 boxes): 60-104% faster
  • Medium inputs (100 boxes): 135-145% faster
  • Large inputs (500 boxes): 128% faster

The speedup is consistent across all test scenarios because the bottleneck (function call overhead) is eliminated regardless of input size, making this particularly valuable for PDFs with many text regions.

Correctness verification report:

Test Status
⏪ Replay Tests 🔘 None Found
⚙️ Existing Unit Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
🌀 Generated Regression Tests 29 Passed
📊 Tests Coverage 100.0%
🌀 Click to see Generated Regression Tests
# imports
from unstructured.partition.pdf_image.pdfminer_processing import _aggregated_iou


# Helper to create a "box1s" array for tests, since the function expects .shape and [i, :]
class DummyArray:
    """Minimal stand-in for an ``(n, 4)`` array of boxes.

    Exposes ``shape`` and supports both plain indexing (``arr[i]``) and
    two-dimensional tuple indexing (``arr[i, :]`` or ``arr[i, j]``).
    """

    def __init__(self, boxes):
        self._boxes = boxes
        self.shape = (len(boxes), 4)

    def __getitem__(self, idx):
        # A tuple index means (row, column); a plain list cannot take a
        # tuple directly, so resolve the row first and then the column part.
        if isinstance(idx, tuple):
            row, col = idx
            return self._boxes[row][col]
        return self._boxes[idx]


# -------------------------
# Basic Test Cases
# -------------------------


def test_multiple_boxes_all_perfect_overlap():
    # Three candidate boxes, each identical to the reference box.
    # NOTE(review): this test only builds its inputs; it never calls
    # _aggregated_iou and asserts nothing.
    reference_box = (0, 0, 2, 2)
    candidates = [(0, 0, 2, 2) for _ in range(3)]
    box1s = DummyArray(candidates)


# -------------------------
# Edge Test Cases
# -------------------------


def test_empty_box1s():
    # Edge case: there are no candidate boxes, only the reference box
    reference_box = (0, 0, 2, 2)
    no_boxes = DummyArray([])
    codeflash_output = _aggregated_iou(no_boxes, reference_box)
    result = codeflash_output  # 2.71μs -> 2.42μs (11.6% faster)
import numpy as np  # needed for array operations

# imports
# function to test
from unstructured.partition.pdf_image.pdfminer_processing import (
    _aggregated_iou,
)

# unit tests

# ============================================================================
# BASIC FUNCTIONALITY TEST CASES
# ============================================================================


def test_single_box_complete_overlap():
    """Test IoU when box1s contains one box identical to box2 (should return 1.0)"""
    # Create a single box in box1s that is identical to box2
    box1s = np.array([[0.0, 0.0, 10.0, 10.0]])
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 19.6μs -> 12.2μs (60.5% faster)

    # Intersection: 100
    # Sum areas: 100 (box2) + 100 = 200
    # Union: 200 - 100 = 100
    # IoU = 100/100 = 1.0
    assert np.isclose(result, 1.0)


def test_multiple_boxes_varying_overlaps():
    """Test IoU with multiple boxes having different overlap amounts"""
    # Create multiple boxes with different overlaps
    box1s = np.array(
        [
            [0.0, 0.0, 10.0, 10.0],  # complete overlap
            [5.0, 0.0, 15.0, 10.0],  # partial overlap
            [20.0, 20.0, 30.0, 30.0],  # no overlap
        ]
    )
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 21.2μs -> 10.4μs (104% faster)

    # Intersection: 100 (first) + 50 (second) + 0 (third) = 150
    # Sum areas: 100 (box2) + 100 + 100 + 100 = 400
    # Union: 400 - 150 = 250
    # IoU: 150/250 = 0.6
    expected = 150.0 / 250.0
    # Compare the computed value with the hand-derived one; without this the test checks nothing
    assert np.isclose(result, expected)


# ============================================================================
# EDGE CASES
# ============================================================================


def test_empty_box1s_array():
    """IoU call with a box1s array that has zero rows (shape (0, 4))"""
    # Reference box first, then an array holding no candidate boxes
    box2 = (0.0, 0.0, 10.0, 10.0)
    box1s = np.empty((0, 4))

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 2.35μs -> 2.33μs (0.900% faster)


def test_zero_area_box_in_box1s():
    """IoU call where the only box in box1s is degenerate (x1 == x2)"""
    # x1 and x2 are both 5.0, so the candidate box has zero width
    degenerate_box = [5.0, 5.0, 5.0, 10.0]
    box1s = np.array([degenerate_box])
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.9μs -> 6.19μs (91.7% faster)


def test_zero_area_box2():
    """IoU call where box2 itself is degenerate"""
    # box2 has x1 == x2 == 5.0, i.e. zero width; box1s holds one ordinary box
    box2 = (5.0, 5.0, 5.0, 10.0)
    box1s = np.array([[0.0, 0.0, 10.0, 10.0]])

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.5μs -> 6.30μs (82.5% faster)


def test_both_boxes_zero_area():
    """IoU call where the box in box1s and box2 are both single points"""
    # Each box collapses to a point, so both areas are zero
    point_box = [5.0, 5.0, 5.0, 5.0]
    box1s = np.array([point_box])
    box2 = (3.0, 3.0, 3.0, 3.0)

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.6μs -> 6.03μs (91.8% faster)


def test_touching_boxes_no_overlap():
    """IoU call where the two boxes share an edge but no interior area"""
    box2 = (0.0, 0.0, 10.0, 10.0)
    # The candidate starts at x = 10.0, exactly where box2 ends
    adjacent_box = [10.0, 0.0, 20.0, 10.0]
    box1s = np.array([adjacent_box])

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.4μs -> 6.11μs (86.9% faster)


def test_negative_coordinates():
    """Test IoU with boxes having negative coordinates"""
    # Create boxes with negative coordinates
    box1s = np.array([[-10.0, -10.0, 0.0, 0.0]])
    box2 = (-5.0, -5.0, 5.0, 5.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.9μs -> 6.28μs (89.3% faster)

    # Intersection area: overlap from (-5,-5) to (0,0) = 25
    # Union: 100 (box1s) + 100 (box2) - 25 = 175
    # IoU = 25/175 = 1/7
    expected = 25.0 / 175.0
    # Compare the computed value with the hand-derived one; without this the test checks nothing
    assert np.isclose(result, expected)


def test_very_small_boxes():
    """Test IoU with very small boxes (minimal dimensions)"""
    # Create very small boxes
    box1s = np.array([[0.0, 0.0, 0.1, 0.1]])
    box2 = (0.05, 0.05, 0.15, 0.15)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.8μs -> 6.40μs (83.7% faster)

    # Intersection: overlap from (0.05, 0.05) to (0.1, 0.1) = 0.0025
    # Union: 0.01 (box1s) + 0.01 (box2) - 0.0025 = 0.0175
    # IoU = 0.0025/0.0175 = 1/7
    expected = 0.0025 / 0.0175
    # Tolerant comparison: these coordinates are not exactly representable as floats
    assert np.isclose(result, expected)


# ============================================================================
# MATHEMATICAL EDGE CASES
# ============================================================================


def test_perfect_union_iou_equals_one():
    """Test that identical boxes give IoU = 1.0"""
    # Create identical boxes
    box1s = np.array([[5.0, 5.0, 15.0, 15.0]])
    box2 = (5.0, 5.0, 15.0, 15.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.6μs -> 6.17μs (87.2% faster)

    # Intersection: 100
    # Sum areas: 100 (box2) + 100 = 200
    # Union: 200 - 100 = 100
    # IoU = 100/100 = 1.0
    assert np.isclose(result, 1.0)


def test_zero_intersection_iou_equals_zero():
    """Test that completely separate boxes give IoU = 0.0"""
    # Create completely separate boxes
    box1s = np.array([[100.0, 100.0, 200.0, 200.0]])
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.3μs -> 5.91μs (90.7% faster)

    # Intersection: 0
    # Union: 100 (box2) + 10000 (box1s) - 0 = 10100
    # IoU = 0/10100 = 0.0
    assert np.isclose(result, 0.0)


def test_multiple_boxes_covering_box2_completely():
    """IoU call where two boxes in box1s tile box2 exactly"""
    box2 = (0.0, 0.0, 10.0, 10.0)
    # Split box2 down the middle at x = 5.0
    left_half = [0.0, 0.0, 5.0, 10.0]
    right_half = [5.0, 0.0, 10.0, 10.0]
    box1s = np.array([left_half, right_half])

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 16.2μs -> 8.13μs (98.9% faster)


def test_overlapping_boxes_in_box1s():
    """Test IoU when boxes in box1s overlap with each other"""
    # Create overlapping boxes in box1s
    box1s = np.array([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 15.0, 15.0]])
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 16.2μs -> 8.46μs (92.1% faster)

    # Intersection with box2: 100 (first) + 25 (second) = 125
    # Sum areas: 100 (box2) + 100 (first) + 100 (second) = 300
    # Union: 300 - 125 = 175
    # IoU = 125/175 = 5/7
    expected = 125.0 / 175.0
    # Compare the computed value with the hand-derived one; without this the test checks nothing
    assert np.isclose(result, expected)


def test_multiple_identical_boxes_in_box1s():
    """Test IoU when box1s contains multiple identical boxes"""
    # Create multiple identical boxes
    box1s = np.array([[0.0, 0.0, 10.0, 10.0], [0.0, 0.0, 10.0, 10.0], [0.0, 0.0, 10.0, 10.0]])
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 19.9μs -> 9.91μs (101% faster)

    # Intersection: 100 + 100 + 100 = 300
    # Sum areas: 100 (box2) + 100 + 100 + 100 = 400
    # Union: 400 - 300 = 100
    # IoU = 300/100 = 3.0
    expected = 300.0 / 100.0
    # Per-box intersections are summed, so duplicated boxes push the ratio above 1.0;
    # pin that aggregate value explicitly
    assert np.isclose(result, expected)


# ============================================================================
# LARGE SCALE TEST CASES
# ============================================================================


def test_many_boxes_no_overlap():
    """IoU call with 100 boxes placed along the diagonal, starting at (20, 20)"""
    # Box i is a 10x10 square whose top-left corner sits at (20*i, 20*i), for i = 1..100
    rows = []
    for i in range(1, 101):
        rows.append([i * 20.0, i * 20.0, i * 20.0 + 10.0, i * 20.0 + 10.0])
    box1s = np.array(rows)
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 351μs -> 146μs (141% faster)


def test_many_boxes_all_overlapping():
    """Test IoU with 100 boxes all overlapping box2"""
    # Create 100 boxes all overlapping with box2
    box1s = np.array([[0.0, 0.0, 10.0, 10.0] for _ in range(100)])
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 384μs -> 163μs (135% faster)

    # Intersection: 100 * 100 = 10000
    # Sum areas: 100 + 100*100 = 10100
    # Union: 10100 - 10000 = 100
    # IoU = 10000/100 = 100.0
    expected = 10000.0 / 100.0
    # Compare the computed value with the hand-derived one; without this the test checks nothing
    assert np.isclose(result, expected)


def test_many_boxes_partial_overlap():
    """IoU call with 200 boxes, each shifted 0.1 further to the right"""
    # Box i is 10 wide and 10 tall, with its left edge at x = 0.1 * i
    box1s = np.array([[i * 0.1, 0.0, i * 0.1 + 10.0, 10.0] for i in range(200)])
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 736μs -> 307μs (139% faster)


def test_large_coordinate_values():
    """Test IoU with very large coordinate values"""
    # Create boxes with large coordinates
    box1s = np.array([[1e6, 1e6, 1e6 + 100, 1e6 + 100]])
    box2 = (1e6 + 50, 1e6 + 50, 1e6 + 150, 1e6 + 150)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 12.0μs -> 6.53μs (83.3% faster)

    # Intersection: overlap from (1e6+50, 1e6+50) to (1e6+100, 1e6+100) = 2500
    # Union: 10000 + 10000 - 2500 = 17500
    # IoU = 2500/17500 = 1/7
    expected = 2500.0 / 17500.0
    # Compare the computed value with the hand-derived one; without this the test checks nothing
    assert np.isclose(result, expected)


def test_high_precision_coordinates():
    """IoU call with coordinates given to nine decimal places"""
    box2 = (5.111111111, 5.222222222, 15.333333333, 15.444444444)
    # A single candidate box whose corners carry long decimal fractions
    precise_box = [0.123456789, 0.987654321, 10.123456789, 10.987654321]
    box1s = np.array([precise_box])

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 11.6μs -> 6.35μs (83.0% faster)


def test_mixed_large_and_small_boxes():
    """IoU call with box sizes spanning several orders of magnitude"""
    large_box = [0.0, 0.0, 1000.0, 1000.0]
    tiny_box = [5.0, 5.0, 5.01, 5.01]
    medium_box = [100.0, 100.0, 200.0, 200.0]
    # Keep the large / tiny / medium ordering of rows
    box1s = np.array([large_box, tiny_box, medium_box])
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 20.3μs -> 10.1μs (101% faster)


def test_many_boxes_grid_pattern():
    """IoU call with 100 boxes laid out on a 10x10 grid"""
    # Each cell is a 3x3 box whose top-left corner is at (5*i, 5*j); i varies slowest
    box1s = np.array(
        [
            [i * 5.0, j * 5.0, i * 5.0 + 3.0, j * 5.0 + 3.0]
            for i in range(10)
            for j in range(10)
        ]
    )
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 357μs -> 145μs (145% faster)


def test_boxes_with_fractional_overlap():
    """Test IoU with boxes having various fractional overlaps"""
    # Create boxes with specific fractional overlaps
    box1s = np.array(
        [
            [0.0, 0.0, 5.0, 10.0],  # 50% overlap
            [0.0, 0.0, 2.5, 10.0],  # 25% overlap
            [0.0, 0.0, 7.5, 10.0],  # 75% overlap
            [0.0, 0.0, 1.0, 10.0],  # 10% overlap
        ]
    )
    box2 = (0.0, 0.0, 10.0, 10.0)

    # Calculate IoU
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 23.7μs -> 11.6μs (104% faster)

    # Intersection: 50 + 25 + 75 + 10 = 160
    # Sum areas: 100 + 50 + 25 + 75 + 10 = 260
    # Union: 260 - 160 = 100
    # IoU = 160/100 = 1.6
    expected = 160.0 / 100.0
    # Compare the computed value with the hand-derived one; without this the test checks nothing
    assert np.isclose(result, expected)


def test_performance_with_500_boxes():
    """IoU call on 500 seeded pseudo-random boxes"""
    # Fixed seed so every run sees the same coordinates
    np.random.seed(42)
    box1s = np.random.rand(500, 4) * 100
    # Each row is a view into box1s, so swapping through it reorders the
    # coordinates in place until x1 <= x2 and y1 <= y2
    for row in box1s:
        if row[0] > row[2]:
            row[0], row[2] = row[2], row[0]
        if row[1] > row[3]:
            row[1], row[3] = row[3], row[1]

    box2 = (25.0, 25.0, 75.0, 75.0)

    # Run the function under test
    codeflash_output = _aggregated_iou(box1s, box2)
    result = codeflash_output  # 1.91ms -> 837μs (128% faster)


# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To test or edit this optimization locally, run `git merge codeflash/optimize-pr4169-2026-01-07T19.01.25`.

Click to see suggested changes
Suggested change
intersection = 0.0
sum_areas = calculate_bbox_area(box2)
for i in range(box1s.shape[0]):
intersection += calculate_intersection_area(box1s[i, :], box2)
sum_areas += calculate_bbox_area(box1s[i, :])
# Extract box2 coordinates once
x1_2, y1_2, x2_2, y2_2 = box2
box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
intersection = 0.0
sum_areas = box2_area
n = box1s.shape[0]
for i in range(n):
# Direct indexing instead of slicing
x1_1 = box1s[i, 0]
y1_1 = box1s[i, 1]
x2_1 = box1s[i, 2]
y2_1 = box1s[i, 3]
# Inline bbox area calculation
sum_areas += (x2_1 - x1_1) * (y2_1 - y1_1)
# Inline intersection calculation
x_intersection = max(x1_1, x1_2)
y_intersection = max(y1_1, y1_2)
x2_intersection = min(x2_1, x2_2)
y2_intersection = min(y2_1, y2_2)
if x_intersection < x2_intersection and y_intersection < y2_intersection:
intersection += (x2_intersection - x_intersection) * (y2_intersection - y_intersection)

Static Badge

union = sum_areas - intersection

if union == 0:
return 1.0
return intersection / union


def aggregate_embedded_text_by_block(
target_region: TextRegions,
source_regions: TextRegions,
threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
subregion_threshold: float = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
text_coverage_threshold: float = env_config.TEXT_COVERAGE_THRESHOLD,
) -> tuple[str, IsExtracted | None]:
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""
Expand All @@ -789,18 +805,29 @@ def aggregate_embedded_text_by_block(
bboxes1_is_almost_subregion_of_bboxes2(
source_regions.element_coords,
target_region.element_coords,
threshold,
subregion_threshold,
)
.sum(axis=1)
.astype(bool)
)

text = " ".join([text for text in source_regions.slice(mask).texts if text])
# if nothing is sliced then it is not extracted
is_extracted = sum(mask) and all(
flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array
)
return text, IsExtracted.TRUE if is_extracted else IsExtracted.FALSE

if sum(mask):
source_bboxes = source_regions.slice(mask).element_coords
target_bboxes = target_region.element_coords

iou = _aggregated_iou(source_bboxes, target_bboxes[0, :])

fully_filled = (
all(flag == IsExtracted.TRUE for flag in source_regions.slice(mask).is_extracted_array)
and iou > text_coverage_threshold
)
is_extracted = IsExtracted.TRUE if fully_filled else IsExtracted.PARTIAL
else:
# if nothing is sliced then it is not extracted
is_extracted = IsExtracted.FALSE
return text, is_extracted


def get_links_in_element(page_links: list, region: Rectangle) -> list:
Expand Down
6 changes: 6 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,5 +234,11 @@ def ANALYSIS_BBOX_FORMAT(self) -> str:
"""The format for analysed pages with bboxes drawn on them. Default is 'png'."""
return self._get_string("ANALYSIS_BBOX_FORMAT", "png")

@property
def TEXT_COVERAGE_THRESHOLD(self) -> float:
    """The minimum IoU between extracted text bboxes and their target inferred element bbox for
    the inferred element to be considered containing extracted text. Default is 0.25."""
    return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25)


env_config = ENVConfig()
Loading