diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d94f26bc5..ca8b098edb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.18.28-dev0 + +### Enhancement +- Optimize `clean_extra_whitespace_with_index_run` (codeflash) + ## 0.18.27 ### Fixes diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4bb2b92ac3..3dc5b6d69e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.27" # pragma: no cover +__version__ = "0.18.28-dev0" # pragma: no cover diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 22c77b9044..b64c7bd19c 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -449,19 +449,29 @@ def clean_extra_whitespace_with_index_run(text: str) -> Tuple[str, np.ndarray]: array([0., 0., 0., 0., 0., 0., 0., 0., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.])) """ - cleaned_text = re.sub(r"[\xa0\n]", " ", text) + # Replace non-breaking space and newlines with a space (using translation table for speed) + translate_table = {ord("\xa0"): ord(" "), ord("\n"): ord(" ")} + cleaned_text = text.translate(translate_table) + # Collapse multiple spaces into one (keeps only single runs) cleaned_text = re.sub(r"([ ]{2,})", " ", cleaned_text) cleaned_text = cleaned_text.strip() moved_indices = np.zeros(len(text)) - distance, original_index, cleaned_index = 0, 0, 0 - while cleaned_index < len(cleaned_text): - if text[original_index] == cleaned_text[cleaned_index] or ( - bool(re.match("[\xa0\n]", text[original_index])) - and bool(re.match(" ", cleaned_text[cleaned_index])) - ): + cleaned_len = len(cleaned_text) + + ws_chars = {"\xa0", "\n"} # For a quick lookup + + distance = 0 + original_index = 0 + cleaned_index = 0 + + while cleaned_index < cleaned_len: + c_orig = text[original_index] + c_clean = cleaned_text[cleaned_index] + + if c_orig == c_clean or (c_orig in ws_chars and c_clean == " "): moved_indices[cleaned_index] = distance original_index += 1 cleaned_index += 1