From 8d75f1f73cdbefa4d3dd2fd78b719016fa196ff2 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sat, 20 Dec 2025 01:04:42 +0000 Subject: [PATCH 1/3] Optimize get_bbox_thickness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimization replaces `np.polyfit` in `_get_optimal_value_for_bbox` with direct linear interpolation, achieving a **13x speedup** by eliminating unnecessary computational overhead. **Key Optimization:** - **Removed `np.polyfit`**: The original code used NumPy's polynomial fitting for a simple linear interpolation between two points, which is computationally expensive - **Direct linear interpolation**: Replaced with manual slope calculation: `slope = (max_value - min_value) / (ratio_for_max_value - ratio_for_min_value)` (this assumes `ratio_for_max_value != ratio_for_min_value`, since the slope divides by their difference) **Why This is Faster:** - `np.polyfit` performs general polynomial regression using least squares, involving matrix operations and a singular value decomposition (SVD) - overkill for two points - Direct slope calculation requires only basic arithmetic operations (subtraction and division) - Line profiler shows the `np.polyfit` line consumed 91.7% of execution time (10.67ms out of 11.64ms total) **Performance Impact:** The function is called from `draw_bbox_on_image`, which processes bounding boxes for PDF image visualization. Since this appears to be in a rendering pipeline that could process many bounding boxes per page, the 13x speedup significantly improves visualization performance. Test results show consistent 12-13x improvements across all scenarios, from single bbox calls (~25μs → ~2μs) to batch processing of 100 random bboxes (1.6ms → 116μs). 
**Optimization Benefits:** - **Small bboxes**: 1329% faster (basic cases) - **Large bboxes**: 1283% faster - **Batch processing**: 1297% faster for 100 random bboxes - **Scale-intensive workloads**: 1341% faster for processing 1000+ bboxes This optimization is particularly valuable for PDF processing workflows where many bounding boxes need thickness calculations for visualization. --- .../partition/pdf_image/analysis/bbox_visualisation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py index 4de4828122..197a07f331 100644 --- a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py +++ b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py @@ -117,8 +117,9 @@ def _get_optimal_value_for_bbox( The optimal value for the given bounding box and parameters given. """ bbox_to_page_ratio = _get_bbox_to_page_ratio(bbox, page_size) - coefficients = np.polyfit((ratio_for_min_value, ratio_for_max_value), (min_value, max_value), 1) - value = int(bbox_to_page_ratio * coefficients[0] + coefficients[1]) + # Direct linear interpolation instead of np.polyfit for better performance + slope = (max_value - min_value) / (ratio_for_max_value - ratio_for_min_value) + value = int(min_value + slope * (bbox_to_page_ratio - ratio_for_min_value)) return max(min_value, min(max_value, value)) From ecfffc3160671f2769f93a0cacdde7858fa64dc0 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Mon, 5 Jan 2026 16:14:26 -0800 Subject: [PATCH 2/3] changelog and version --- CHANGELOG.md | 5 +++++ unstructured/__version__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6709039bda..0f49ad9ed3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.18.27-dev5 + +### Enhancement +- Optimize `get_bbox_thickness` (codeflash) + ## 0.18.26 ### Fixes diff --git 
a/unstructured/__version__.py b/unstructured/__version__.py index 868cca8535..c72d9a6020 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.26" # pragma: no cover +__version__ = "0.18.27-dev5" # pragma: no cover From 6f6ac8b971081c0093741aeb8ec25772b8a9b439 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 7 Jan 2026 12:36:40 -0600 Subject: [PATCH 3/3] version sync --- unstructured/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c72d9a6020..c0da63dd46 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.27-dev5" # pragma: no cover +__version__ = "0.18.27-dev6" # pragma: no cover