Unstructured-IO · qued · Jan 8, 2026 · Dec 23, 2025 · Jan 5, 2026 · Jan 6, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.18.28-dev0
+
+### Enhancement
+- Optimize `clean_extra_whitespace_with_index_run` by replacing per-character regex matching with `str.translate` and set-membership checks (codeflash)
+
 ## 0.18.27
 
 ### Fixes

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.18.27"  # pragma: no cover
+__version__ = "0.18.28-dev0"  # pragma: no cover
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
@@ -449,19 +449,29 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]:
     array([0., 0., 0., 0., 0., 0., 0., 0., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.]))
     """
 
-    cleaned_text = re.sub(r"[\xa0\n]", " ", text)
+    # Replace non-breaking space and newlines with a space (using translation table for speed)
+    translate_table = {ord("\xa0"): ord(" "), ord("\n"): ord(" ")}
+    cleaned_text = text.translate(translate_table)
+    # Collapse multiple spaces into one (keeps only single runs)
     cleaned_text = re.sub(r"([ ]{2,})", " ", cleaned_text)
 
     cleaned_text = cleaned_text.strip()
 
     moved_indices = np.zeros(len(text))
 
-    distance, original_index, cleaned_index = 0, 0, 0
-    while cleaned_index < len(cleaned_text):
-        if text[original_index] == cleaned_text[cleaned_index] or (
-            bool(re.match("[\xa0\n]", text[original_index]))
-            and bool(re.match(" ", cleaned_text[cleaned_index]))
-        ):
+    cleaned_len = len(cleaned_text)
+
+    ws_chars = {"\xa0", "\n"}  # For a quick lookup
+
+    distance = 0
+    original_index = 0
+    cleaned_index = 0
+
+    while cleaned_index < cleaned_len:
+        c_orig = text[original_index]
+        c_clean = cleaned_text[cleaned_index]
+
+        if c_orig == c_clean or (c_orig in ws_chars and c_clean == " "):
             moved_indices[cleaned_index] = distance
             original_index += 1
             cleaned_index += 1
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.18.27" # pragma: no cover
		+__version__ = "0.18.28-dev0" # pragma: no cover