Skip to content
Merged
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.18.28-dev0

### Enhancement
- Speed up `clean_extra_whitespace_with_index_run` by replacing per-character regex matching with `str.translate` and set membership checks (codeflash)

## 0.18.27

### Fixes
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.18.27" # pragma: no cover
__version__ = "0.18.28-dev0" # pragma: no cover
24 changes: 17 additions & 7 deletions unstructured/cleaners/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,19 +449,29 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]:
array([0., 0., 0., 0., 0., 0., 0., 0., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.]))
"""

cleaned_text = re.sub(r"[\xa0\n]", " ", text)
# Replace non-breaking space and newlines with a space (using translation table for speed)
translate_table = {ord("\xa0"): ord(" "), ord("\n"): ord(" ")}
cleaned_text = text.translate(translate_table)
# Collapse each run of two or more spaces into a single space
cleaned_text = re.sub(r"([ ]{2,})", " ", cleaned_text)

cleaned_text = cleaned_text.strip()

moved_indices = np.zeros(len(text))

distance, original_index, cleaned_index = 0, 0, 0
while cleaned_index < len(cleaned_text):
if text[original_index] == cleaned_text[cleaned_index] or (
bool(re.match("[\xa0\n]", text[original_index]))
and bool(re.match(" ", cleaned_text[cleaned_index]))
):
cleaned_len = len(cleaned_text)

ws_chars = {"\xa0", "\n"} # For a quick lookup

distance = 0
original_index = 0
cleaned_index = 0

while cleaned_index < cleaned_len:
c_orig = text[original_index]
c_clean = cleaned_text[cleaned_index]

if c_orig == c_clean or (c_orig in ws_chars and c_clean == " "):
moved_indices[cleaned_index] = distance
original_index += 1
cleaned_index += 1
Expand Down
Loading