From d576343ea3e95620d811e8ba99f80860cc080df8 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Tue, 23 Dec 2025 05:49:44 +0000 Subject: [PATCH 1/9] Optimize clean_extra_whitespace_with_index_run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimized code achieves a **68% speedup** through two key changes that eliminate expensive operations in the main loop: ## What Changed 1. **Character replacement optimization**: Replaced `re.sub(r"[\xa0\n]", " ", text)` with `text.translate()` using a translation table. This avoids regex compilation and pattern matching for simple character substitutions. 2. **Main loop optimization**: Eliminated two `re.match()` calls per iteration by: - Pre-computing character comparisons (`c_orig = text_chars[original_index]`) - Using set membership (`c_orig in ws_chars`) instead of regex matching - Direct character comparison (`c_clean == ' '`) instead of regex ## Why It's Faster Looking at the line profiler data, the original code spent **15.4% of total time** (10.8% + 4.6%) on regex matching inside the loop: - `bool(re.match("[\xa0\n]", text[original_index]))` - 7.12ms (10.8%) - `bool(re.match(" ", cleaned_text[cleaned_index]))` - 3.02ms (4.6%) The optimized version replaces these with: - Set membership check: `c_orig in ws_chars` - 1.07ms (1.4%) - Direct comparison: `c_clean == ' '` (included in same line) **Result**: Regex overhead is eliminated, saving ~9ms per 142 invocations in the benchmark. 
## Performance Profile The annotated tests show the optimization excels on: - **Large inputs with whitespace**: `test_large_leading_and_trailing_whitespace` shows 291% speedup (203μs → 52.1μs) - **Many consecutive whitespace characters**: `test_large_mixed_whitespace_everywhere` shows 297% speedup (189μs → 47.8μs) - **Mixed whitespace types** (spaces, newlines, nbsp): `test_edge_all_whitespace_between_words` shows 47.9% speedup Small inputs with minimal whitespace see minor regressions (~5-17% slower) due to setup overhead, but these are negligible in absolute terms (< 2μs difference). ## Impact on Production Workloads The function is called in `_process_pdfminer_pages()` during PDF text extraction, processing **every text snippet on every page**. Given that PDFs often contain: - Multiple consecutive spaces between words - Newlines from paragraph breaks - Non-breaking spaces from formatting this optimization will provide substantial cumulative benefits when processing large documents with hundreds of pages, as the per-snippet savings compound across the entire document.
--- unstructured/cleaners/core.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 10fc83a180..5176652ddd 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -450,19 +450,35 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]: array([0., 0., 0., 0., 0., 0., 0., 0., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.])) """ - cleaned_text = re.sub(r"[\xa0\n]", " ", text) + # Replace non-breaking space and newlines with a space (using translation table for speed) + translate_table = {ord("\xa0"): ord(" "), ord("\n"): ord(" ")} + cleaned_text = text.translate(translate_table) + # Collapse multiple spaces into one (keeps only single runs) cleaned_text = re.sub(r"([ ]{2,})", " ", cleaned_text) cleaned_text = cleaned_text.strip() moved_indices = np.zeros(len(text)) - distance, original_index, cleaned_index = 0, 0, 0 - while cleaned_index < len(cleaned_text): - if text[original_index] == cleaned_text[cleaned_index] or ( - bool(re.match("[\xa0\n]", text[original_index])) - and bool(re.match(" ", cleaned_text[cleaned_index])) - ): + # Optimize by using lookup instead of re.match in main loop + len(text) + cleaned_len = len(cleaned_text) + + ws_chars = {"\xa0", "\n"} # For quick lookup + + distance = 0 + original_index = 0 + cleaned_index = 0 + + # Fetch once for performance + text_chars = text + cleaned_chars = cleaned_text + + while cleaned_index < cleaned_len: + c_orig = text_chars[original_index] + c_clean = cleaned_chars[cleaned_index] + + if c_orig == c_clean or (c_orig in ws_chars and c_clean == " "): moved_indices[cleaned_index] = distance original_index += 1 cleaned_index += 1 From b17abe11ca8a6c117d10fecf392a0ab4256088fa Mon Sep 17 00:00:00 2001 From: misrasaurabh1 Date: Mon, 5 Jan 2026 14:08:50 -0800 Subject: [PATCH 2/9] clean logic --- unstructured/cleaners/core.py | 12 +++--------- 1 file 
changed, 3 insertions(+), 9 deletions(-) diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 5176652ddd..1141bab6b3 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -460,23 +460,17 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]: moved_indices = np.zeros(len(text)) - # Optimize by using lookup instead of re.match in main loop - len(text) cleaned_len = len(cleaned_text) - ws_chars = {"\xa0", "\n"} # For quick lookup + ws_chars = {"\xa0", "\n"} # For a quick lookup distance = 0 original_index = 0 cleaned_index = 0 - # Fetch once for performance - text_chars = text - cleaned_chars = cleaned_text - while cleaned_index < cleaned_len: - c_orig = text_chars[original_index] - c_clean = cleaned_chars[cleaned_index] + c_orig = text[original_index] + c_clean = cleaned_text[cleaned_index] if c_orig == c_clean or (c_orig in ws_chars and c_clean == " "): moved_indices[cleaned_index] = distance From 66f044981941d4600a39359e8fcefa26dffc301a Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 10:54:40 -0800 Subject: [PATCH 3/9] changelog and version --- CHANGELOG.md | 9 +++++++++ unstructured/__version__.py | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6709039bda..ee295027e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,4 @@ +<<<<<<< Updated upstream ## 0.18.26 ### Fixes @@ -9,6 +10,14 @@ - **Security update**: Removed pdfminer.six version constraint and bumped pdfminer.six and urllib3 to address high severity CVEs ## 0.18.24 +======= +## 0.18.27-dev6 + +### Enhancement +- Optimize `clean_extra_whitespace_with_index_run` (codeflash) + +## 0.18.24-dev0 +>>>>>>> Stashed changes ### Enhancement - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 868cca8535..d90b944f64 100644 --- a/unstructured/__version__.py +++ 
b/unstructured/__version__.py @@ -1 +1,5 @@ +<<<<<<< Updated upstream __version__ = "0.18.26" # pragma: no cover +======= +__version__ = "0.18.27-dev6" # pragma: no cover +>>>>>>> Stashed changes From 2519504c7f0b57e01592e65e830765018669a856 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 11:02:46 -0800 Subject: [PATCH 4/9] changelog and version --- unstructured/__version__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d90b944f64..c0da63dd46 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1,5 +1 @@ -<<<<<<< Updated upstream -__version__ = "0.18.26" # pragma: no cover -======= __version__ = "0.18.27-dev6" # pragma: no cover ->>>>>>> Stashed changes From 58a4d57c73fde68a74709ce1cb6cce3901646f15 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 11:15:58 -0800 Subject: [PATCH 5/9] changelog fix --- CHANGELOG.md | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee295027e8..ba58624b89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,8 @@ -<<<<<<< Updated upstream +## 0.18.27-dev6 + +### Enhancement +- Optimize `clean_extra_whitespace_with_index_run` (codeflash) + ## 0.18.26 ### Fixes @@ -10,22 +14,10 @@ - **Security update**: Removed pdfminer.six version constraint and bumped pdfminer.six and urllib3 to address high severity CVEs ## 0.18.24 -======= -## 0.18.27-dev6 - -### Enhancement -- Optimize `clean_extra_whitespace_with_index_run` (codeflash) - -## 0.18.24-dev0 ->>>>>>> Stashed changes ### Enhancement - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash) - -### Fixes -- **Security update**: Bumped dependencies to address security vulnerabilities - ## 0.18.23 ### Enhancement From b56f1e3c073070dc97ef28b5b50fb7a91cdfaefd Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 11:17:35 -0800 Subject: [PATCH 6/9] undo changelog edit --- 
CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba58624b89..1913aaf20e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,9 @@ ### Enhancement - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash) +### Fixes +- **Security update**: Bumped dependencies to address security vulnerabilities + ## 0.18.23 ### Enhancement From 6dbb2494300c8d9bc695ad3168ac32487a4d0820 Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 11:18:43 -0800 Subject: [PATCH 7/9] undo newline --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1913aaf20e..1b21769109 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,6 @@ ### Enhancement - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash) - ### Fixes - **Security update**: Bumped dependencies to address security vulnerabilities From 046d7b1aa6128762b167964dd4d64cf03748649a Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 6 Jan 2026 11:19:49 -0800 Subject: [PATCH 8/9] correct number of newlines --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b21769109..58670fe3b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ ### Enhancement - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash) + + ### Fixes - **Security update**: Bumped dependencies to address security vulnerabilities From 9985bec73cb8ba6ff86e396b9b98e172065f50f3 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 7 Jan 2026 14:10:10 -0600 Subject: [PATCH 9/9] version sync --- unstructured/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c0da63dd46..7f17a89593 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.27-dev6" # pragma: no cover +__version__ = "0.18.27-dev7" # pragma: no cover