From d576343ea3e95620d811e8ba99f80860cc080df8 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]"
 <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Tue, 23 Dec 2025 05:49:44 +0000
Subject: [PATCH 1/9] Optimize clean_extra_whitespace_with_index_run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

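The optimized code achieves a **68% speedup** through two key changes that replace regex operations with cheaper built-in string operations: one in the preprocessing step that runs once before the main loop, and one inside the main loop itself: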

## What Changed

1. **Character replacement optimization**: Replaced `re.sub(r"[\xa0\n]", " ", text)` with `text.translate()` using a translation table. This avoids regex compilation and pattern matching for simple character substitutions.

2. **Main loop optimization**: Eliminated two `re.match()` calls per iteration by:
   - Reading the current character of each string into a local once per iteration (`c_orig = text_chars[original_index]`)
   - Using set membership (`c_orig in ws_chars`) instead of regex matching
   - Using direct character comparison (`c_clean == ' '`) instead of a regex

## Why It's Faster

Looking at the line profiler data, the original code spent **15.4% of total time** (10.8% + 4.6%) on regex matching inside the loop:
- `bool(re.match("[\xa0\n]", text[original_index]))` - 7.12ms (10.8%)
- `bool(re.match(" ", cleaned_text[cleaned_index]))` - 3.02ms (4.6%)

The optimized version replaces these with:
- Set membership check: `c_orig in ws_chars` - 1.07ms (1.4%)
- Direct comparison: `c_clean == ' '` (part of the same `if` statement, so its cost is included in the figure above)

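**Result**: The per-iteration regex overhead in the main loop is eliminated, saving ~9ms across the 142 invocations in the benchmark. (The single `re.sub` call that collapses runs of spaces is unchanged.)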

## Performance Profile

The annotated tests show the optimization excels when:
- **Large inputs with whitespace**: `test_large_leading_and_trailing_whitespace` shows 291% speedup (203μs → 52.1μs)
- **Many consecutive whitespace characters**: `test_large_mixed_whitespace_everywhere` shows 297% speedup (189μs → 47.8μs)
- **Mixed whitespace types** (spaces, newlines, nbsp): `test_edge_all_whitespace_between_words` shows 47.9% speedup

Small inputs with minimal whitespace see minor regressions (~5-17% slower) due to setup overhead, but these are negligible in absolute terms (< 2μs difference).

## Impact on Production Workloads

The function is called in `_process_pdfminer_pages()` during PDF text extraction, processing **every text snippet on every page**. PDFs often contain:
- Multiple spaces/tabs between words
- Newlines from paragraph breaks
- Non-breaking spaces from formatting

Because such whitespace is common, this optimization will provide substantial cumulative benefits when processing large documents with hundreds of pages, as the per-snippet savings compound across the entire document.
---
 unstructured/cleaners/core.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 10fc83a180..5176652ddd 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -450,19 +450,35 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]:
     array([0., 0., 0., 0., 0., 0., 0., 0., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.]))
     """
 
-    cleaned_text = re.sub(r"[\xa0\n]", " ", text)
+    # Replace non-breaking space and newlines with a space (using translation table for speed)
+    translate_table = {ord("\xa0"): ord(" "), ord("\n"): ord(" ")}
+    cleaned_text = text.translate(translate_table)
+    # Collapse multiple spaces into one (keeps only single runs)
     cleaned_text = re.sub(r"([ ]{2,})", " ", cleaned_text)
 
     cleaned_text = cleaned_text.strip()
 
     moved_indices = np.zeros(len(text))
 
-    distance, original_index, cleaned_index = 0, 0, 0
-    while cleaned_index < len(cleaned_text):
-        if text[original_index] == cleaned_text[cleaned_index] or (
-            bool(re.match("[\xa0\n]", text[original_index]))
-            and bool(re.match(" ", cleaned_text[cleaned_index]))
-        ):
+    # Optimize by using lookup instead of re.match in main loop
+    len(text)
+    cleaned_len = len(cleaned_text)
+
+    ws_chars = {"\xa0", "\n"}  # For quick lookup
+
+    distance = 0
+    original_index = 0
+    cleaned_index = 0
+
+    # Fetch once for performance
+    text_chars = text
+    cleaned_chars = cleaned_text
+
+    while cleaned_index < cleaned_len:
+        c_orig = text_chars[original_index]
+        c_clean = cleaned_chars[cleaned_index]
+
+        if c_orig == c_clean or (c_orig in ws_chars and c_clean == " "):
             moved_indices[cleaned_index] = distance
             original_index += 1
             cleaned_index += 1

From b17abe11ca8a6c117d10fecf392a0ab4256088fa Mon Sep 17 00:00:00 2001
From: misrasaurabh1 <misra.saurabh1@gmail.com>
Date: Mon, 5 Jan 2026 14:08:50 -0800
Subject: [PATCH 2/9] clean logic

---
 unstructured/cleaners/core.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 5176652ddd..1141bab6b3 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -460,23 +460,17 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]:
 
     moved_indices = np.zeros(len(text))
 
-    # Optimize by using lookup instead of re.match in main loop
-    len(text)
     cleaned_len = len(cleaned_text)
 
-    ws_chars = {"\xa0", "\n"}  # For quick lookup
+    ws_chars = {"\xa0", "\n"}  # For a quick lookup
 
     distance = 0
     original_index = 0
     cleaned_index = 0
 
-    # Fetch once for performance
-    text_chars = text
-    cleaned_chars = cleaned_text
-
     while cleaned_index < cleaned_len:
-        c_orig = text_chars[original_index]
-        c_clean = cleaned_chars[cleaned_index]
+        c_orig = text[original_index]
+        c_clean = cleaned_text[cleaned_index]
 
         if c_orig == c_clean or (c_orig in ws_chars and c_clean == " "):
             moved_indices[cleaned_index] = distance

From 66f044981941d4600a39359e8fcefa26dffc301a Mon Sep 17 00:00:00 2001
From: Aseem Saxena <aseem.bits@gmail.com>
Date: Tue, 6 Jan 2026 10:54:40 -0800
Subject: [PATCH 3/9] changelog and version

---
 CHANGELOG.md                | 9 +++++++++
 unstructured/__version__.py | 4 ++++
 2 files changed, 13 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6709039bda..ee295027e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,4 @@
+<<<<<<< Updated upstream
 ## 0.18.26
 
 ### Fixes
@@ -9,6 +10,14 @@
 - **Security update**: Removed pdfminer.six version constraint and bumped pdfminer.six and urllib3 to address high severity CVEs
 
 ## 0.18.24
+=======
+## 0.18.27-dev6
+
+### Enhancement
+- Optimize `clean_extra_whitespace_with_index_run` (codeflash)
+
+## 0.18.24-dev0
+>>>>>>> Stashed changes
 
 ### Enhancement
 - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash)
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 868cca8535..d90b944f64 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1,5 @@
+<<<<<<< Updated upstream
 __version__ = "0.18.26"  # pragma: no cover
+=======
+__version__ = "0.18.27-dev6"  # pragma: no cover
+>>>>>>> Stashed changes

From 2519504c7f0b57e01592e65e830765018669a856 Mon Sep 17 00:00:00 2001
From: Aseem Saxena <aseem.bits@gmail.com>
Date: Tue, 6 Jan 2026 11:02:46 -0800
Subject: [PATCH 4/9] changelog and version

---
 unstructured/__version__.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index d90b944f64..c0da63dd46 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1,5 +1 @@
-<<<<<<< Updated upstream
-__version__ = "0.18.26"  # pragma: no cover
-=======
 __version__ = "0.18.27-dev6"  # pragma: no cover
->>>>>>> Stashed changes

From 58a4d57c73fde68a74709ce1cb6cce3901646f15 Mon Sep 17 00:00:00 2001
From: Aseem Saxena <aseem.bits@gmail.com>
Date: Tue, 6 Jan 2026 11:15:58 -0800
Subject: [PATCH 5/9] changelog fix

---
 CHANGELOG.md | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ee295027e8..ba58624b89 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,8 @@
-<<<<<<< Updated upstream
+## 0.18.27-dev6
+
+### Enhancement
+- Optimize `clean_extra_whitespace_with_index_run` (codeflash)
+
 ## 0.18.26
 
 ### Fixes
@@ -10,22 +14,10 @@
 - **Security update**: Removed pdfminer.six version constraint and bumped pdfminer.six and urllib3 to address high severity CVEs
 
 ## 0.18.24
-=======
-## 0.18.27-dev6
-
-### Enhancement
-- Optimize `clean_extra_whitespace_with_index_run` (codeflash)
-
-## 0.18.24-dev0
->>>>>>> Stashed changes
 
 ### Enhancement
 - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash)
 
-
-### Fixes
-- **Security update**: Bumped dependencies to address security vulnerabilities
-
 ## 0.18.23
 
 ### Enhancement

From b56f1e3c073070dc97ef28b5b50fb7a91cdfaefd Mon Sep 17 00:00:00 2001
From: Aseem Saxena <aseem.bits@gmail.com>
Date: Tue, 6 Jan 2026 11:17:35 -0800
Subject: [PATCH 6/9] undo changelog edit

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba58624b89..1913aaf20e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,9 @@
 ### Enhancement
 - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash)
 
+### Fixes
+- **Security update**: Bumped dependencies to address security vulnerabilities
+
 ## 0.18.23
 
 ### Enhancement

From 6dbb2494300c8d9bc695ad3168ac32487a4d0820 Mon Sep 17 00:00:00 2001
From: Aseem Saxena <aseem.bits@gmail.com>
Date: Tue, 6 Jan 2026 11:18:43 -0800
Subject: [PATCH 7/9] undo newline

---
 CHANGELOG.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1913aaf20e..1b21769109 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,6 @@
 
 ### Enhancement
 - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash)
-
 ### Fixes
 - **Security update**: Bumped dependencies to address security vulnerabilities
 

From 046d7b1aa6128762b167964dd4d64cf03748649a Mon Sep 17 00:00:00 2001
From: Aseem Saxena <aseem.bits@gmail.com>
Date: Tue, 6 Jan 2026 11:19:49 -0800
Subject: [PATCH 8/9] correct number of newlines

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b21769109..58670fe3b9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,8 @@
 
 ### Enhancement
 - Optimize `OCRAgentTesseract.extract_word_from_hocr` (codeflash)
+
+
 ### Fixes
 - **Security update**: Bumped dependencies to address security vulnerabilities
 

From 9985bec73cb8ba6ff86e396b9b98e172065f50f3 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Wed, 7 Jan 2026 14:10:10 -0600
Subject: [PATCH 9/9] version sync

---
 unstructured/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index c0da63dd46..7f17a89593 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.18.27-dev6"  # pragma: no cover
+__version__ = "0.18.27-dev7"  # pragma: no cover