From b28a5a381b67e4af134639fca0984929fa39a2c4 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 07:40:18 +0000 Subject: [PATCH 01/11] Refactor: Move some code to new files for reuse No new code is introduced; only existing code is shuffled around and the functions moved are unchanged as well. --- codespell_lib/_codespell.py | 65 ++---------------------------- codespell_lib/_text_util.py | 27 +++++++++++++ codespell_lib/spellchecker.py | 75 +++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 62 deletions(-) create mode 100644 codespell_lib/_text_util.py create mode 100644 codespell_lib/spellchecker.py diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 62a51b75b3..89945988dc 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -39,10 +39,13 @@ Tuple, ) +from ._text_util import fix_case + # autogenerated by setuptools_scm from ._version import ( # type: ignore[import-not-found] __version__ as VERSION, # noqa: N812 ) +from .spellchecker import Misspelling, build_dict word_regex_def = r"[\w\-'’]+" # noqa: RUF001 # While we want to treat characters like ( or " as okay for a starting break, @@ -52,9 +55,6 @@ "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|" "\\b[\\w.%+-]+@[\\w.-]+\\b)" ) -# Pass all misspellings through this translation table to generate -# alternative misspellings and fixes. -alt_chars = (("'", "’"),) # noqa: RUF001 inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P[\w,]*))?") USAGE = """ \t%prog [OPTIONS] [file1 file2 ... 
fileN] @@ -167,13 +167,6 @@ def match(self, filename: str) -> bool: return any(fnmatch.fnmatch(filename, p) for p in self.pattern_list) -class Misspelling: - def __init__(self, data: str, fix: bool, reason: str) -> None: - self.data = data - self.fix = fix - self.reason = reason - - class TermColors: def __init__(self) -> None: self.FILE = "\033[33m" @@ -703,48 +696,6 @@ def build_ignore_words( ) -def add_misspelling( - key: str, - data: str, - misspellings: Dict[str, Misspelling], -) -> None: - data = data.strip() - - if "," in data: - fix = False - data, reason = data.rsplit(",", 1) - reason = reason.lstrip() - else: - fix = True - reason = "" - - misspellings[key] = Misspelling(data, fix, reason) - - -def build_dict( - filename: str, - misspellings: Dict[str, Misspelling], - ignore_words: Set[str], -) -> None: - with open(filename, encoding="utf-8") as f: - translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars] - for line in f: - [key, data] = line.split("->") - # TODO: For now, convert both to lower. - # Someday we can maybe add support for fixing caps. 
- key = key.lower() - data = data.lower() - if key not in ignore_words: - add_misspelling(key, data, misspellings) - # generate alternative misspellings/fixes - for x, table in translate_tables: - if x in key: - alt_key = key.translate(table) - alt_data = data.translate(table) - if alt_key not in ignore_words: - add_misspelling(alt_key, alt_data, misspellings) - - def is_hidden(filename: str, check_hidden: bool) -> bool: bfilename = os.path.basename(filename) @@ -759,16 +710,6 @@ def is_text_file(filename: str) -> bool: return b"\x00" not in s -def fix_case(word: str, fixword: str) -> str: - if word == word.capitalize(): - return ", ".join(w.strip().capitalize() for w in fixword.split(",")) - if word == word.upper(): - return fixword.upper() - # they are both lower case - # or we don't have any idea - return fixword - - def ask_for_word_fix( line: str, match: Match[str], diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py new file mode 100644 index 0000000000..18a2ec89b4 --- /dev/null +++ b/codespell_lib/_text_util.py @@ -0,0 +1,27 @@ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see +# https://www.gnu.org/licenses/old-licenses/gpl-2.0.html. 
+""" +Copyright (C) 2010-2011 Lucas De Marchi +Copyright (C) 2011 ProFUSION embedded systems +""" + + +def fix_case(word: str, fixword: str) -> str: + if word == word.capitalize(): + return ", ".join(w.strip().capitalize() for w in fixword.split(",")) + if word == word.upper(): + return fixword.upper() + # they are both lower case + # or we don't have any idea + return fixword diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py new file mode 100644 index 0000000000..82865cdd19 --- /dev/null +++ b/codespell_lib/spellchecker.py @@ -0,0 +1,75 @@ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see +# https://www.gnu.org/licenses/old-licenses/gpl-2.0.html. +""" +Copyright (C) 2010-2011 Lucas De Marchi +Copyright (C) 2011 ProFUSION embedded systems +""" + +from typing import ( + Dict, + Set, +) + +# Pass all misspellings through this translation table to generate +# alternative misspellings and fixes. 
+alt_chars = (("'", "’"),) # noqa: RUF001 + + +class Misspelling: + def __init__(self, data: str, fix: bool, reason: str) -> None: + self.data = data + self.fix = fix + self.reason = reason + + +def add_misspelling( + key: str, + data: str, + misspellings: Dict[str, Misspelling], +) -> None: + data = data.strip() + + if "," in data: + fix = False + data, reason = data.rsplit(",", 1) + reason = reason.lstrip() + else: + fix = True + reason = "" + + misspellings[key] = Misspelling(data, fix, reason) + + +def build_dict( + filename: str, + misspellings: Dict[str, Misspelling], + ignore_words: Set[str], +) -> None: + with open(filename, encoding="utf-8") as f: + translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars] + for line in f: + [key, data] = line.split("->") + # TODO: For now, convert both to lower. + # Someday we can maybe add support for fixing caps. + key = key.lower() + data = data.lower() + if key not in ignore_words: + add_misspelling(key, data, misspellings) + # generate alternative misspellings/fixes + for x, table in translate_tables: + if x in key: + alt_key = key.translate(table) + alt_data = data.translate(table) + if alt_key not in ignore_words: + add_misspelling(alt_key, alt_data, misspellings) From 824bd7c2ea283f1a10baddefb720ef8aa4858c76 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 08:21:10 +0000 Subject: [PATCH 02/11] Replace `data: str` with `candidates: Sequence[str]` When the spelling dictionaries are loaded, previously the correction line was just stored in memory as a simple text. Throughout the code, callers would then have to deal with the `data` attribute, correctly `split()` + `strip()` it. With this change, the dictionary parsing code now encapsulates this problem. The auto-correction works from the assumption that there is only one candidate. This assumption is invariant and seems to be properly maintained in the code. Therefore, we can just pick the first candidate word when doing a correction. 
In the code, the following name changes are performed: * `Misspelling.data` -> `Misspelling.candidates` * `fixword` -> `candidates` when used for multiple candidates (`fixword` remains for when it is a correction) On performance: Performance-wise, this change moves computation from "checking" time to "startup" time. The performance cost does not appear to be noticeable in my baseline (#3419). Though, keep the corpus weakness on the ratio of cased vs. non-cased corrections with multiple candidates in mind. The all lowercase typo is now slightly more expensive (it was passed through `fix_case` and fed directly into the `print` in the original code. In the new code, it will always need a `join`). There is still an overweight of lower-case only corrections in general, so the unconditional `.join` alone is not sufficient to affect the performance noticeably. --- codespell_lib/_codespell.py | 36 +++++++++++++++++++---------------- codespell_lib/_text_util.py | 10 ++++++---- codespell_lib/spellchecker.py | 11 ++++++++--- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 89945988dc..2fa15adfe9 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -26,6 +26,10 @@ import sys import textwrap from ctypes import wintypes +from .spellchecker import ( + build_dict, + Misspelling, +) from typing import ( Any, Dict, @@ -45,7 +49,6 @@ from ._version import ( # type: ignore[import-not-found] __version__ as VERSION, # noqa: N812 ) -from .spellchecker import Misspelling, build_dict word_regex_def = r"[\w\-'’]+" # noqa: RUF001 # While we want to treat characters like ( or " as okay for a starting break, @@ -716,10 +719,10 @@ def ask_for_word_fix( misspelling: Misspelling, interactivity: int, colors: TermColors, -) -> Tuple[bool, str]: +) -> Tuple[bool, Sequence[str]]: wrongword = match.group() if interactivity <= 0: - return misspelling.fix, fix_case(wrongword, misspelling.data) + 
return misspelling.fix, fix_case(wrongword, misspelling.candidates) line_ui = ( f"{line[:match.start()]}" @@ -729,7 +732,8 @@ def ask_for_word_fix( if misspelling.fix and interactivity & 1: r = "" - fixword = fix_case(wrongword, misspelling.data) + candidates = fix_case(wrongword, misspelling.candidates) + fixword = candidates[0] while not r: print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True) r = sys.stdin.readline().strip().upper() @@ -747,12 +751,12 @@ def ask_for_word_fix( # we ask the user which word to use r = "" - opt = [w.strip() for w in misspelling.data.split(",")] + opt = misspelling.candidates while not r: print(f"{line_ui} Choose an option (blank for none): ", end="") - for i, o in enumerate(opt): - fixword = fix_case(wrongword, o) - print(f" {i}) {fixword}", end="") + cased_candidates = fix_case(wrongword, opt) + for i, candidates in enumerate(cased_candidates): + print(f" {i}) {candidates}", end="") print(": ", end="", flush=True) n = sys.stdin.readline().strip() @@ -767,9 +771,9 @@ def ask_for_word_fix( if r: misspelling.fix = True - misspelling.data = r + misspelling.candidates = (r,) - return misspelling.fix, fix_case(wrongword, misspelling.data) + return misspelling.fix, fix_case(wrongword, misspelling.candidates) def print_context( @@ -861,14 +865,14 @@ def parse_file( if lword not in misspellings: continue fix = misspellings[lword].fix - fixword = fix_case(word, misspellings[lword].data) + candidates = fix_case(word, misspellings[lword].candidates) if summary and fix: summary.update(lword) cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" - crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}" + crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" reason = misspellings[lword].reason if reason: @@ -958,13 +962,13 @@ def parse_file( context_shown = False fix = misspellings[lword].fix - fixword = fix_case(word, misspellings[lword].data) + candidates 
= fix_case(word, misspellings[lword].candidates) if options.interactive and lword not in asked_for: if context is not None: context_shown = True print_context(lines, i, context) - fix, fixword = ask_for_word_fix( + fix, candidates = ask_for_word_fix( lines[i], match, misspellings[lword], @@ -981,7 +985,7 @@ def parse_file( if options.write_changes and fix: changed = True - lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i]) + lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i]) fixed_words.add(word) continue @@ -996,7 +1000,7 @@ def parse_file( cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" cline = f"{colors.FILE}{i + 1}{colors.DISABLE}" cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" - crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}" + crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" reason = misspellings[lword].reason if reason: diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py index 18a2ec89b4..c141db503d 100644 --- a/codespell_lib/_text_util.py +++ b/codespell_lib/_text_util.py @@ -16,12 +16,14 @@ Copyright (C) 2011 ProFUSION embedded systems """ +from typing import Sequence -def fix_case(word: str, fixword: str) -> str: + +def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]: if word == word.capitalize(): - return ", ".join(w.strip().capitalize() for w in fixword.split(",")) + return tuple(c.capitalize() for c in candidates) if word == word.upper(): - return fixword.upper() + return tuple(c.upper() for c in candidates) # they are both lower case # or we don't have any idea - return fixword + return candidates diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py index 82865cdd19..fadaf49e44 100644 --- a/codespell_lib/spellchecker.py +++ b/codespell_lib/spellchecker.py @@ -18,6 +18,7 @@ from typing import ( Dict, + Sequence, Set, ) @@ -27,8 +28,8 @@ class Misspelling: - def __init__(self, data: str, fix: bool, reason: str) -> None: - self.data = data + def 
__init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None: + self.candidates = candidates self.fix = fix self.reason = reason @@ -48,7 +49,11 @@ def add_misspelling( fix = True reason = "" - misspellings[key] = Misspelling(data, fix, reason) + misspellings[key] = Misspelling( + tuple(c.strip() for c in data.split(",")), + fix, + reason, + ) def build_dict( From aa7792fee731a5b195277bfe7a9184e04be63da5 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 09:23:23 +0000 Subject: [PATCH 03/11] Refactor dictionary into a new `Spellchecker` class This is as close to a 1:1 conversion as possible. It might change when we get to designing the API. The callers have been refactored to only perform the lookup once. This was mostly to keep the code more readable. The performance cost does seem noticeable, which is unsurprising. This method has a higher cost towards non-matches which is the most common case. This commit causes the performance to drop roughly 10% on its own and we are now slower than the goal. 
--- codespell_lib/_codespell.py | 42 +++++++++++---------- codespell_lib/_text_util.py | 2 +- codespell_lib/spellchecker.py | 71 ++++++++++++++++++++++------------- 3 files changed, 68 insertions(+), 47 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 2fa15adfe9..faa546ddf6 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -26,10 +26,6 @@ import sys import textwrap from ctypes import wintypes -from .spellchecker import ( - build_dict, - Misspelling, -) from typing import ( Any, Dict, @@ -49,6 +45,10 @@ from ._version import ( # type: ignore[import-not-found] __version__ as VERSION, # noqa: N812 ) +from .spellchecker import ( + Misspelling, + Spellchecker, +) word_regex_def = r"[\w\-'’]+" # noqa: RUF001 # While we want to treat characters like ( or " as okay for a starting break, @@ -837,7 +837,7 @@ def parse_file( filename: str, colors: TermColors, summary: Optional[Summary], - misspellings: Dict[str, Misspelling], + spellchecker: Spellchecker, ignore_words_cased: Set[str], exclude_lines: Set[str], file_opener: FileOpener, @@ -862,10 +862,11 @@ def parse_file( if word in ignore_words_cased: continue lword = word.lower() - if lword not in misspellings: + misspelling = spellchecker.check_lower_cased_word(lword) + if misspelling is None: continue - fix = misspellings[lword].fix - candidates = fix_case(word, misspellings[lword].candidates) + fix = misspelling.fix + candidates = fix_case(word, misspelling.candidates) if summary and fix: summary.update(lword) @@ -874,7 +875,7 @@ def parse_file( cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" - reason = misspellings[lword].reason + reason = misspelling.reason if reason: if options.quiet_level & QuietLevels.DISABLED_FIXES: continue @@ -946,7 +947,8 @@ def parse_file( if word in ignore_words_cased: continue lword = word.lower() - if lword in misspellings and lword not in 
extra_words_to_ignore: + misspelling = spellchecker.check_lower_cased_word(lword) + if misspelling is not None and lword not in extra_words_to_ignore: # Sometimes we find a 'misspelling' which is actually a valid word # preceded by a string escape sequence. Ignore such cases as # they're usually false alarms; see issue #17 among others. @@ -956,13 +958,13 @@ def parse_file( and line[char_before_idx] == "\\" # bell, backspace, formfeed, newline, carriage-return, tab, vtab. and word.startswith(("a", "b", "f", "n", "r", "t", "v")) - and lword[1:] not in misspellings + and spellchecker.check_lower_cased_word(lword[1:]) is None ): continue context_shown = False - fix = misspellings[lword].fix - candidates = fix_case(word, misspellings[lword].candidates) + fix = misspelling.fix + candidates = fix_case(word, misspelling.candidates) if options.interactive and lword not in asked_for: if context is not None: @@ -971,7 +973,7 @@ def parse_file( fix, candidates = ask_for_word_fix( lines[i], match, - misspellings[lword], + misspelling, options.interactive, colors=colors, ) @@ -993,7 +995,7 @@ def parse_file( if ( options.interactive & 2 and not fix - and not misspellings[lword].reason + and not misspelling.reason ): continue @@ -1002,7 +1004,7 @@ def parse_file( cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" - reason = misspellings[lword].reason + reason = misspelling.reason if reason: if options.quiet_level & QuietLevels.DISABLED_FIXES: continue @@ -1174,9 +1176,9 @@ def main(*args: str) -> int: parser.print_help() return EX_USAGE use_dictionaries.append(dictionary) - misspellings: Dict[str, Misspelling] = {} + spellchecker = Spellchecker() for dictionary in use_dictionaries: - build_dict(dictionary, misspellings, ignore_words) + spellchecker.add_from_file(dictionary, ignore_words=ignore_words) colors = TermColors() if not options.colors: colors.disable() @@ -1251,7 +1253,7 @@ def main(*args: str) -> 
int: fname, colors, summary, - misspellings, + spellchecker, ignore_words_cased, exclude_lines, file_opener, @@ -1276,7 +1278,7 @@ def main(*args: str) -> int: filename, colors, summary, - misspellings, + spellchecker, ignore_words_cased, exclude_lines, file_opener, diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py index c141db503d..33e6d7e033 100644 --- a/codespell_lib/_text_util.py +++ b/codespell_lib/_text_util.py @@ -24,6 +24,6 @@ def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]: return tuple(c.capitalize() for c in candidates) if word == word.upper(): return tuple(c.upper() for c in candidates) - # they are both lower case + # they are both lower-case # or we don't have any idea return candidates diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py index fadaf49e44..4f19e71269 100644 --- a/codespell_lib/spellchecker.py +++ b/codespell_lib/spellchecker.py @@ -17,9 +17,10 @@ """ from typing import ( + Container, Dict, + Optional, Sequence, - Set, ) # Pass all misspellings through this translation table to generate @@ -34,7 +35,49 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None: self.reason = reason -def add_misspelling( +class Spellchecker: + def __init__(self) -> None: + self._misspellings: Dict[str, Misspelling] = {} + + def check_lower_cased_word(self, word: str) -> Optional[Misspelling]: + """Check a given word against the loaded dictionaries + + :param word: The word to check. This should be all lower-case. + """ + return self._misspellings.get(word) + + def add_from_file( + self, + filename: str, + *, + ignore_words: Container[str] = frozenset(), + ) -> None: + """Parse a codespell dictionary + + :param filename: The codespell dictionary file to parse + :param ignore_words: Words to ignore from this dictionary. 
+ """ + misspellings = self._misspellings + with open(filename, encoding="utf-8") as f: + translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars] + for line in f: + [key, data] = line.split("->") + # TODO: For now, convert both to lower. + # Someday we can maybe add support for fixing caps. + key = key.lower() + data = data.lower() + if key not in ignore_words: + _add_misspelling(key, data, misspellings) + # generate alternative misspellings/fixes + for x, table in translate_tables: + if x in key: + alt_key = key.translate(table) + alt_data = data.translate(table) + if alt_key not in ignore_words: + _add_misspelling(alt_key, alt_data, misspellings) + + +def _add_misspelling( key: str, data: str, misspellings: Dict[str, Misspelling], @@ -54,27 +97,3 @@ def add_misspelling( fix, reason, ) - - -def build_dict( - filename: str, - misspellings: Dict[str, Misspelling], - ignore_words: Set[str], -) -> None: - with open(filename, encoding="utf-8") as f: - translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars] - for line in f: - [key, data] = line.split("->") - # TODO: For now, convert both to lower. - # Someday we can maybe add support for fixing caps. - key = key.lower() - data = data.lower() - if key not in ignore_words: - add_misspelling(key, data, misspellings) - # generate alternative misspellings/fixes - for x, table in translate_tables: - if x in key: - alt_key = key.translate(table) - alt_data = data.translate(table) - if alt_key not in ignore_words: - add_misspelling(alt_key, alt_data, misspellings) From ef5096c2d8bc07bf5e9165bfc8b495843ee19b4c Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 10:35:19 +0000 Subject: [PATCH 04/11] Refactor line tokenization to simplify an outer loop The refactor is a stepping stone towards the next commit where the inner loop is moved to the `Spellchecker`. 
--- codespell_lib/_codespell.py | 54 +++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index faa546ddf6..d290c6815f 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -37,6 +37,7 @@ Sequence, Set, Tuple, + Callable, ) from ._text_util import fix_case @@ -833,6 +834,34 @@ def apply_uri_ignore_words( return check_matches +def line_tokenizer_factory( + uri_ignore_words: Set[str], + uri_regex: Pattern[str], + word_regex: Pattern[str], + ignore_word_regex: Optional[Pattern[str]], +) -> Callable[[str], Iterable[re.Match[str]]]: + def line_tokenizer(line: str) -> Iterable[Match[str]]: + # If all URI spelling errors will be ignored, erase any URI before + # extracting words. Otherwise, apply ignores after extracting words. + # This ensures that if a URI ignore word occurs both inside a URI and + # outside, it will still be a spelling error. + if "*" in uri_ignore_words: + line = uri_regex.sub(" ", line) + check_matches = extract_words_iter(line, word_regex, ignore_word_regex) + if "*" not in uri_ignore_words: + check_matches = apply_uri_ignore_words( + check_matches, + line, + word_regex, + ignore_word_regex, + uri_regex, + uri_ignore_words, + ) + return check_matches + + return line_tokenizer + + def parse_file( filename: str, colors: TermColors, @@ -910,6 +939,13 @@ def parse_file( except OSError: return bad_count + line_tokenizer = line_tokenizer_factory( + uri_ignore_words, + uri_regex, + word_regex, + ignore_word_regex, + ) + for i, line in enumerate(lines): if line.rstrip() in exclude_lines: continue @@ -926,23 +962,7 @@ def parse_file( fixed_words = set() asked_for = set() - # If all URI spelling errors will be ignored, erase any URI before - # extracting words. Otherwise, apply ignores after extracting words. - # This ensures that if a URI ignore word occurs both inside a URI and - # outside, it will still be a spelling error. 
- if "*" in uri_ignore_words: - line = uri_regex.sub(" ", line) - check_matches = extract_words_iter(line, word_regex, ignore_word_regex) - if "*" not in uri_ignore_words: - check_matches = apply_uri_ignore_words( - check_matches, - line, - word_regex, - ignore_word_regex, - uri_regex, - uri_ignore_words, - ) - for match in check_matches: + for match in line_tokenizer(line): word = match.group() if word in ignore_words_cased: continue From cd57087893324d672ea94324c384b46d32f08d59 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 11:13:19 +0000 Subject: [PATCH 05/11] Rewrite line spellchecking and move most of it into the `Spellchecker` With this rewrite, performance improved slightly and is now down to 7% slower than the baseline (6s vs. 5.6s). There is a deliberate over-indentation left in this commit, since that makes this commit easier to review (without ignoring space changes). --- codespell_lib/_codespell.py | 48 +++++++++++---------------------- codespell_lib/spellchecker.py | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 31 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index d290c6815f..4f9aba8664 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -37,7 +37,6 @@ Sequence, Set, Tuple, - Callable, ) from ._text_util import fix_case @@ -47,7 +46,8 @@ __version__ as VERSION, # noqa: N812 ) from .spellchecker import ( - Misspelling, + DetectedMisspelling, + LineTokenizer, Spellchecker, ) @@ -716,15 +716,17 @@ def is_text_file(filename: str) -> bool: def ask_for_word_fix( line: str, - match: Match[str], - misspelling: Misspelling, + issue: DetectedMisspelling, interactivity: int, colors: TermColors, ) -> Tuple[bool, Sequence[str]]: - wrongword = match.group() + wrongword = issue.word + misspelling = issue.misspelling if interactivity <= 0: return misspelling.fix, fix_case(wrongword, misspelling.candidates) + match = issue.re_match + line_ui = ( 
f"{line[:match.start()]}" f"{colors.WWORD}{wrongword}{colors.DISABLE}" @@ -839,7 +841,7 @@ def line_tokenizer_factory( uri_regex: Pattern[str], word_regex: Pattern[str], ignore_word_regex: Optional[Pattern[str]], -) -> Callable[[str], Iterable[re.Match[str]]]: +) -> LineTokenizer: def line_tokenizer(line: str) -> Iterable[Match[str]]: # If all URI spelling errors will be ignored, erase any URI before # extracting words. Otherwise, apply ignores after extracting words. @@ -867,7 +869,6 @@ def parse_file( colors: TermColors, summary: Optional[Summary], spellchecker: Spellchecker, - ignore_words_cased: Set[str], exclude_lines: Set[str], file_opener: FileOpener, word_regex: Pattern[str], @@ -888,7 +889,7 @@ def parse_file( else: if options.check_filenames: for word in extract_words(filename, word_regex, ignore_word_regex): - if word in ignore_words_cased: + if word in spellchecker.ignore_words_cased: continue lword = word.lower() misspelling = spellchecker.check_lower_cased_word(lword) @@ -962,25 +963,12 @@ def parse_file( fixed_words = set() asked_for = set() - for match in line_tokenizer(line): - word = match.group() - if word in ignore_words_cased: - continue - lword = word.lower() - misspelling = spellchecker.check_lower_cased_word(lword) - if misspelling is not None and lword not in extra_words_to_ignore: - # Sometimes we find a 'misspelling' which is actually a valid word - # preceded by a string escape sequence. Ignore such cases as - # they're usually false alarms; see issue #17 among others. - char_before_idx = match.start() - 1 - if ( - char_before_idx >= 0 - and line[char_before_idx] == "\\" - # bell, backspace, formfeed, newline, carriage-return, tab, vtab. 
- and word.startswith(("a", "b", "f", "n", "r", "t", "v")) - and spellchecker.check_lower_cased_word(lword[1:]) is None - ): - continue + issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore) + for issue in issues: + # TODO: De-indent in next commit + misspelling = issue.misspelling + word = issue.word + lword = issue.lword context_shown = False fix = misspelling.fix @@ -992,8 +980,7 @@ def parse_file( print_context(lines, i, context) fix, candidates = ask_for_word_fix( lines[i], - match, - misspelling, + issue, options.interactive, colors=colors, ) @@ -1197,6 +1184,7 @@ def main(*args: str) -> int: return EX_USAGE use_dictionaries.append(dictionary) spellchecker = Spellchecker() + spellchecker.ignore_words_cased = ignore_words_cased for dictionary in use_dictionaries: spellchecker.add_from_file(dictionary, ignore_words=ignore_words) colors = TermColors() @@ -1274,7 +1262,6 @@ def main(*args: str) -> int: colors, summary, spellchecker, - ignore_words_cased, exclude_lines, file_opener, word_regex, @@ -1299,7 +1286,6 @@ def main(*args: str) -> int: colors, summary, spellchecker, - ignore_words_cased, exclude_lines, file_opener, word_regex, diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py index 4f19e71269..0d87eef366 100644 --- a/codespell_lib/spellchecker.py +++ b/codespell_lib/spellchecker.py @@ -17,8 +17,11 @@ """ from typing import ( + Callable, Container, Dict, + Iterable, + Match, Optional, Sequence, ) @@ -28,6 +31,9 @@ alt_chars = (("'", "’"),) # noqa: RUF001 +LineTokenizer = Callable[[str], Iterable[Match[str]]] + + class Misspelling: def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None: self.candidates = candidates @@ -35,9 +41,49 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None: self.reason = reason +class DetectedMisspelling: + + def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None: + 
self.word = word + self.lword = lword + self.misspelling = misspelling + self.re_match = match + + class Spellchecker: def __init__(self) -> None: self._misspellings: Dict[str, Misspelling] = {} + self.ignore_words_cased: Container[str] = frozenset() + + def spellcheck_line( + self, + line: str, + tokenizer: Callable[[str], Iterable[re.Match[str]]], + *, + extra_words_to_ignore: Container[str] = frozenset() + ) -> Iterable[DetectedMisspelling]: + misspellings = self._misspellings + ignore_words_cased = self.ignore_words_cased + for match in tokenizer(line): + word = match.group() + if word in ignore_words_cased: + continue + lword = word.lower() + misspelling = misspellings.get(lword) + if misspelling is not None and lword not in extra_words_to_ignore: + # Sometimes we find a 'misspelling' which is actually a valid word + # preceded by a string escape sequence. Ignore such cases as + # they're usually false alarms; see issue #17 among others. + char_before_idx = match.start() - 1 + if ( + char_before_idx >= 0 + and line[char_before_idx] == "\\" + # bell, backspace, formfeed, newline, carriage-return, tab, vtab. + and word.startswith(("a", "b", "f", "n", "r", "t", "v")) + and lword[1:] not in misspellings + ): + continue + yield DetectedMisspelling(word, lword, misspelling, match) def check_lower_cased_word(self, word: str) -> Optional[Misspelling]: """Check a given word against the loaded dictionaries From 8bd3517ccc556744340d37f6a374697ca1aabdc7 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 11:17:21 +0000 Subject: [PATCH 06/11] De-indent loop body (whitespace-only / reformatting-only change) Deliberately in a separate commit. There are no functional changes, but there are some reformatting changes (line merges) as a consequence of the de-indent. 
--- codespell_lib/_codespell.py | 122 +++++++++++++++++------------------- 1 file changed, 58 insertions(+), 64 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 4f9aba8664..98efc12033 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -965,80 +965,74 @@ def parse_file( issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore) for issue in issues: - # TODO: De-indent in next commit - misspelling = issue.misspelling - word = issue.word - lword = issue.lword + misspelling = issue.misspelling + word = issue.word + lword = issue.lword - context_shown = False - fix = misspelling.fix - candidates = fix_case(word, misspelling.candidates) + context_shown = False + fix = misspelling.fix + candidates = fix_case(word, misspelling.candidates) - if options.interactive and lword not in asked_for: - if context is not None: - context_shown = True - print_context(lines, i, context) - fix, candidates = ask_for_word_fix( - lines[i], - issue, - options.interactive, - colors=colors, - ) - asked_for.add(lword) + if options.interactive and lword not in asked_for: + if context is not None: + context_shown = True + print_context(lines, i, context) + fix, candidates = ask_for_word_fix( + lines[i], + issue, + options.interactive, + colors=colors, + ) + asked_for.add(lword) - if summary and fix: - summary.update(lword) + if summary and fix: + summary.update(lword) - if word in fixed_words: # can skip because of re.sub below - continue + if word in fixed_words: # can skip because of re.sub below + continue - if options.write_changes and fix: - changed = True - lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i]) - fixed_words.add(word) - continue + if options.write_changes and fix: + changed = True + lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i]) + fixed_words.add(word) + continue - # otherwise warning was explicitly set by interactive mode - if ( - 
options.interactive & 2 - and not fix - and not misspelling.reason - ): - continue + # otherwise warning was explicitly set by interactive mode + if options.interactive & 2 and not fix and not misspelling.reason: + continue - cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" - cline = f"{colors.FILE}{i + 1}{colors.DISABLE}" - cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" - crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" + cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" + cline = f"{colors.FILE}{i + 1}{colors.DISABLE}" + cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" + crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}" - reason = misspelling.reason - if reason: - if options.quiet_level & QuietLevels.DISABLED_FIXES: - continue - creason = f" | {colors.FILE}{reason}{colors.DISABLE}" - else: - if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES: - continue - creason = "" + reason = misspelling.reason + if reason: + if options.quiet_level & QuietLevels.DISABLED_FIXES: + continue + creason = f" | {colors.FILE}{reason}{colors.DISABLE}" + else: + if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES: + continue + creason = "" - # If we get to this point (uncorrected error) we should change - # our bad_count and thus return value - bad_count += 1 + # If we get to this point (uncorrected error) we should change + # our bad_count and thus return value + bad_count += 1 - if (not context_shown) and (context is not None): - print_context(lines, i, context) - if filename != "-": - print( - f"{cfilename}:{cline}: {cwrongword} " - f"==> {crightword}{creason}" - ) - elif options.stdin_single_line: - print(f"{cline}: {cwrongword} ==> {crightword}{creason}") - else: - print( - f"{cline}: {line.strip()}\n\t{cwrongword} " - f"==> {crightword}{creason}" - ) + if (not context_shown) and (context is not None): + print_context(lines, i, context) + if filename != "-": + print( + f"{cfilename}:{cline}: {cwrongword} " 
f"==> {crightword}{creason}" + ) + elif options.stdin_single_line: + print(f"{cline}: {cwrongword} ==> {crightword}{creason}") + else: + print( + f"{cline}: {line.strip()}\n\t{cwrongword} " + f"==> {crightword}{creason}" + ) if changed: if filename == "-": From 7273c77edd389154d5e6a848633e506b411d76ae Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 14:21:17 +0000 Subject: [PATCH 07/11] Support non-regex based tokens for `spellcheck_line` The `Spellchecker` only needs the `group` method from the `re.Match`. With a bit of generics and typing protocols, we can make the `Spellchecker` work with any token type that has a `group()` method. The `codespell` command line tool still assumes `re.Match` but it can get that via its own line tokenizer, so it all works out for everyone. --- codespell_lib/_codespell.py | 6 +- codespell_lib/spellchecker.py | 155 +++++++++++++++++++++++++++++++--- 2 files changed, 145 insertions(+), 16 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 98efc12033..8cbb1155b8 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -716,7 +716,7 @@ def is_text_file(filename: str) -> bool: def ask_for_word_fix( line: str, - issue: DetectedMisspelling, + issue: "DetectedMisspelling[re.Match[str]]", interactivity: int, colors: TermColors, ) -> Tuple[bool, Sequence[str]]: @@ -725,7 +725,7 @@ def ask_for_word_fix( if interactivity <= 0: return misspelling.fix, fix_case(wrongword, misspelling.candidates) - match = issue.re_match + match = issue.token line_ui = ( f"{line[:match.start()]}" @@ -841,7 +841,7 @@ def line_tokenizer_factory( uri_regex: Pattern[str], word_regex: Pattern[str], ignore_word_regex: Optional[Pattern[str]], -) -> LineTokenizer: +) -> "LineTokenizer[re.Match[str]]": def line_tokenizer(line: str) -> Iterable[Match[str]]: # If all URI spelling errors will be ignored, erase any URI before # extracting words. Otherwise, apply ignores after extracting words. 
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py index 0d87eef366..9d1c5398d6 100644 --- a/codespell_lib/spellchecker.py +++ b/codespell_lib/spellchecker.py @@ -16,22 +16,135 @@ Copyright (C) 2011 ProFUSION embedded systems """ +import os +import re from typing import ( - Callable, Container, Dict, + Generic, Iterable, - Match, Optional, + Protocol, Sequence, + TypeVar, ) # Pass all misspellings through this translation table to generate # alternative misspellings and fixes. alt_chars = (("'", "’"),) # noqa: RUF001 +T_co = TypeVar("T_co", bound="Token", covariant=True) -LineTokenizer = Callable[[str], Iterable[Match[str]]] + +supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU") +supported_languages = supported_languages_en + +# Users might want to link this file into /usr/local/bin, so we resolve the +# symbolic link path to the real path if necessary. +_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") +_builtin_dictionaries = ( + # name, desc, name, err in aspell, correction in aspell, \ + # err dictionary array, rep dictionary array + # The arrays must contain the names of aspell dictionaries + # The aspell tests here aren't the ideal state, but the None's are + # realistic for obscure words + ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None), + ( + "rare", + "for rare (but valid) words that are likely to be errors", + "_rare", + None, + None, + None, + None, + ), + ( + "informal", + "for making informal words more formal", + "_informal", + True, + True, + supported_languages_en, + supported_languages_en, + ), + ( + "usage", + "for replacing phrasing with recommended terms", + "_usage", + None, + None, + None, + None, + ), + ( + "code", + "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)", # noqa: E501 + "_code", + None, + None, + None, + None, + ), + ( + "names", + "for valid proper names that might be typos", 
+ "_names", + None, + None, + None, + None, + ), + ( + "en-GB_to_en-US", + "for corrections from en-GB to en-US", + "_en-GB_to_en-US", + True, + True, + ("en_GB",), + ("en_US",), + ), +) +_builtin_default = "clear,rare" + +_builtin_default_as_tuple = tuple(_builtin_default.split(",")) + + +class UnknownBuiltinDictionaryError(ValueError): + def __init__(self, name: str) -> None: + super().__init__(f"Unknown built-in dictionary: {name}") + + +class BuiltinDictionariesAlreadyLoadedError(TypeError): + def __init__(self) -> None: + super().__init__( + "load_builtin_dictionaries must not be called more than once", + ) + + +class LineTokenizer(Protocol[T_co]): + """Callable that splits a line into multiple tokens to be spellchecked + + Generally, a regex will do for simple cases. A probably too simple one is: + + >>> tokenizer = re.compile(r"[^ ]+").finditer + + For more complex cases, either use more complex regexes or custom tokenization + code. + """ + + def __call__(self, line: str) -> Iterable[T_co]: ... + + +class Token(Protocol): + """Describes a token + + This is a protocol to support `re.Match[str]` (which codespell uses) and any + other tokenization method that our API consumers might be using. + """ + + def group(self) -> str: ... + + def start(self) -> int: ... 
class Misspelling: @@ -41,13 +154,18 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None: self.reason = reason -class DetectedMisspelling: - - def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None: +class DetectedMisspelling(Generic[T_co]): + def __init__( + self, + word: str, + lword: str, + misspelling: Misspelling, + token: T_co, + ) -> None: self.word = word self.lword = lword self.misspelling = misspelling - self.re_match = match + self.token = token class Spellchecker: @@ -58,14 +176,25 @@ def __init__(self) -> None: def spellcheck_line( self, line: str, - tokenizer: Callable[[str], Iterable[re.Match[str]]], + tokenizer: LineTokenizer[T_co], *, extra_words_to_ignore: Container[str] = frozenset() - ) -> Iterable[DetectedMisspelling]: + ) -> Iterable[DetectedMisspelling[T_co]]: + """Tokenize and spellcheck a line + + Split the line into tokens based using the provided tokenizer. See the doc + string for the class for an example. + + :param line: The line to spellcheck. + :param tokenizer: A callable that will tokenize the line + :param extra_words_to_ignore: Extra words to ignore for this particular line + (such as content from a `codespell:ignore` comment) + """ misspellings = self._misspellings ignore_words_cased = self.ignore_words_cased - for match in tokenizer(line): - word = match.group() + + for token in tokenizer(line): + word = token.group() if word in ignore_words_cased: continue lword = word.lower() @@ -74,7 +203,7 @@ def spellcheck_line( # Sometimes we find a 'misspelling' which is actually a valid word # preceded by a string escape sequence. Ignore such cases as # they're usually false alarms; see issue #17 among others. 
- char_before_idx = match.start() - 1 + char_before_idx = token.start() - 1 if ( char_before_idx >= 0 and line[char_before_idx] == "\\" @@ -83,7 +212,7 @@ def spellcheck_line( and lword[1:] not in misspellings ): continue - yield DetectedMisspelling(word, lword, misspelling, match) + yield DetectedMisspelling(word, lword, misspelling, token) def check_lower_cased_word(self, word: str) -> Optional[Misspelling]: """Check a given word against the loaded dictionaries From c4d1738c0e0fce05cb5d98919e863f14c646b214 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 13:55:49 +0000 Subject: [PATCH 08/11] Speed up spellchecking by ignoring whitespace-only lines The new API has introduced extra overhead per line being spellchecked. One way of optimizing out this overhead, is to spellcheck fewer lines. An obvious choice here, is to optimize out empty and whitespace-only lines, since they will not have any typos at all (on account of not having any words). A side-effect of this change is that we now spellcheck lines with trailing whitespace stripped. Semantically, this gives the same result (per "whitespace never has typos"). Performance-wise, it is faster in theory because the strings are now shorter (since we were calling `.rstrip()` anyway). In practice, I am not sure we are going to find any real corpus where the trailing whitespace is noteworthy from a performance point of view. On the performance corpus from #3491, this takes out ~0.4s of runtime bringing us down to slightly above the 5.6s that made the baseline.
--- codespell_lib/_codespell.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 8cbb1155b8..da9cbcafb6 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -948,7 +948,8 @@ def parse_file( ) for i, line in enumerate(lines): - if line.rstrip() in exclude_lines: + line = line.rstrip() + if not line or line in exclude_lines: continue extra_words_to_ignore = set() From 3c08c9bac1217fa94516fefcc20e391799f7519d Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 13:33:41 +0000 Subject: [PATCH 09/11] Move `codespell:ignore` check into `Spellchecker` This makes the API automatically avoid some declared false-positives that the command line tool would also filter. --- codespell_lib/_codespell.py | 13 +------------ codespell_lib/spellchecker.py | 25 ++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index da9cbcafb6..32aea6bbe1 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -59,7 +59,6 @@ "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|" "\\b[\\w.%+-]+@[\\w.-]+\\b)" ) -inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P[\w,]*))?") USAGE = """ \t%prog [OPTIONS] [file1 file2 ... 
fileN] """ @@ -952,20 +951,10 @@ def parse_file( if not line or line in exclude_lines: continue - extra_words_to_ignore = set() - match = inline_ignore_regex.search(line) - if match: - extra_words_to_ignore = set( - filter(None, (match.group("words") or "").split(",")) - ) - if not extra_words_to_ignore: - continue - fixed_words = set() asked_for = set() - issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore) - for issue in issues: + for issue in spellchecker.spellcheck_line(line, line_tokenizer): misspelling = issue.misspelling word = issue.word lword = issue.lword diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py index 9d1c5398d6..f1ad6885b6 100644 --- a/codespell_lib/spellchecker.py +++ b/codespell_lib/spellchecker.py @@ -21,6 +21,7 @@ from typing import ( Container, Dict, + FrozenSet, Generic, Iterable, Optional, @@ -108,6 +109,8 @@ _builtin_default_as_tuple = tuple(_builtin_default.split(",")) +_inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P[\w,]*))?") + class UnknownBuiltinDictionaryError(ValueError): def __init__(self, name: str) -> None: @@ -173,12 +176,21 @@ def __init__(self) -> None: self._misspellings: Dict[str, Misspelling] = {} self.ignore_words_cased: Container[str] = frozenset() + def _parse_inline_ignore(self, line: str) -> Optional[FrozenSet[str]]: + inline_ignore_match = _inline_ignore_regex.search(line) + if inline_ignore_match: + words = frozenset( + filter(None, (inline_ignore_match.group("words") or "").split(",")) + ) + return words if words else None + return frozenset() + def spellcheck_line( self, line: str, tokenizer: LineTokenizer[T_co], *, - extra_words_to_ignore: Container[str] = frozenset() + respect_inline_ignore: bool = True, ) -> Iterable[DetectedMisspelling[T_co]]: """Tokenize and spellcheck a line @@ -187,12 +199,19 @@ def spellcheck_line( :param line: The line to spellcheck. 
:param tokenizer: A callable that will tokenize the line - :param extra_words_to_ignore: Extra words to ignore for this particular line - (such as content from a `codespell:ignore` comment) + :param respect_inline_ignore: Whether to check the line for + `codespell:ignore` instructions + :returns: An iterable of discovered typos. """ misspellings = self._misspellings ignore_words_cased = self.ignore_words_cased + extra_words_to_ignore = ( + self._parse_inline_ignore(line) if respect_inline_ignore else frozenset() + ) + if extra_words_to_ignore is None: + return + for token in tokenizer(line): word = token.group() if word in ignore_words_cased: From ce280c94275565bec966f410db3ad25f26529545 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Fri, 17 May 2024 13:41:23 +0000 Subject: [PATCH 10/11] Speed up `codespell:ignore` check by skipping the regex in most cases The changes to provide a public API had some performance related costs of about 1% runtime. There is no trivial way to offset this any further without undermining the API we are building. However, we can pull performance-related shenanigans to compensate for the cost introduced. The codespell codebase unsurprisingly spends a vast majority of its runtime in various regex related code such as `search` and `finditer`. The best way to optimize runtime spent in regexes is to not do a regex in the first place, since the regex engine has a rather steep overhead over regular string primitives (that is the cost of flexibility). If the regex rarely matches and there is a very easy static substring that can be used to rule out the match, then you can speed up the code by using `substring in string` as a conditional to skip the regex. This is assuming the regex is used enough for the performance to matter. An obvious choice here falls on the `codespell:ignore` regex, because it has a very distinctive substring in the form of `codespell:ignore`, which will rule out almost all lines that will not match.
With this little trick, runtime goes from ~5.6s to ~4.9s on the corpus mentioned in #3419. --- codespell_lib/spellchecker.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py index f1ad6885b6..ac43074798 100644 --- a/codespell_lib/spellchecker.py +++ b/codespell_lib/spellchecker.py @@ -109,7 +109,10 @@ _builtin_default_as_tuple = tuple(_builtin_default.split(",")) -_inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P[\w,]*))?") +_codespell_ignore_tag = "codespell:ignore" +_inline_ignore_regex = re.compile( + rf"[^\w\s]\s?{_codespell_ignore_tag}\b(\s+(?P[\w,]*))?" +) class UnknownBuiltinDictionaryError(ValueError): @@ -177,6 +180,8 @@ def __init__(self) -> None: self.ignore_words_cased: Container[str] = frozenset() def _parse_inline_ignore(self, line: str) -> Optional[FrozenSet[str]]: + if _codespell_ignore_tag not in line: + return frozenset() inline_ignore_match = _inline_ignore_regex.search(line) if inline_ignore_match: words = frozenset( From ae0e8d2544a92c1a5b4b35ec6ffb2876a1829be7 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Sat, 25 May 2024 09:41:58 +0000 Subject: [PATCH 11/11] Refactor: Rename `spellchecker.py` to `_spellchecker.py` Per review comment. 
--- codespell_lib/_codespell.py | 10 +++++----- codespell_lib/{spellchecker.py => _spellchecker.py} | 0 2 files changed, 5 insertions(+), 5 deletions(-) rename codespell_lib/{spellchecker.py => _spellchecker.py} (100%) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 32aea6bbe1..baa176cba1 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -39,17 +39,17 @@ Tuple, ) +from ._spellchecker import ( + DetectedMisspelling, + LineTokenizer, + Spellchecker, +) from ._text_util import fix_case # autogenerated by setuptools_scm from ._version import ( # type: ignore[import-not-found] __version__ as VERSION, # noqa: N812 ) -from .spellchecker import ( - DetectedMisspelling, - LineTokenizer, - Spellchecker, -) word_regex_def = r"[\w\-'’]+" # noqa: RUF001 # While we want to treat characters like ( or " as okay for a starting break, diff --git a/codespell_lib/spellchecker.py b/codespell_lib/_spellchecker.py similarity index 100% rename from codespell_lib/spellchecker.py rename to codespell_lib/_spellchecker.py