From b28a5a381b67e4af134639fca0984929fa39a2c4 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 07:40:18 +0000
Subject: [PATCH 01/11] Refactor: Move some code to new files for reuse

No new code is introduced; only existing code is shuffled around and
the functions moved are unchanged as well.
---
 codespell_lib/_codespell.py   | 65 ++----------------------------
 codespell_lib/_text_util.py   | 27 +++++++++++++
 codespell_lib/spellchecker.py | 75 +++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 62 deletions(-)
 create mode 100644 codespell_lib/_text_util.py
 create mode 100644 codespell_lib/spellchecker.py
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 62a51b75b3..89945988dc 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -39,10 +39,13 @@
     Tuple,
 )
 
+from ._text_util import fix_case
+
 # autogenerated by setuptools_scm
 from ._version import (  # type: ignore[import-not-found]
     __version__ as VERSION,  # noqa: N812
 )
+from .spellchecker import Misspelling, build_dict
 
 word_regex_def = r"[\w\-'’]+"  # noqa: RUF001
 # While we want to treat characters like ( or " as okay for a starting break,
@@ -52,9 +55,6 @@
     "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
     "\\b[\\w.%+-]+@[\\w.-]+\\b)"
 )
-# Pass all misspellings through this translation table to generate
-# alternative misspellings and fixes.
-alt_chars = (("'", "’"),)  # noqa: RUF001
 inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?")
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@@ -167,13 +167,6 @@ def match(self, filename: str) -> bool:
         return any(fnmatch.fnmatch(filename, p) for p in self.pattern_list)
 
 
-class Misspelling:
-    def __init__(self, data: str, fix: bool, reason: str) -> None:
-        self.data = data
-        self.fix = fix
-        self.reason = reason
-
-
 class TermColors:
     def __init__(self) -> None:
         self.FILE = "\033[33m"
@@ -703,48 +696,6 @@ def build_ignore_words(
         )
 
 
-def add_misspelling(
-    key: str,
-    data: str,
-    misspellings: Dict[str, Misspelling],
-) -> None:
-    data = data.strip()
-
-    if "," in data:
-        fix = False
-        data, reason = data.rsplit(",", 1)
-        reason = reason.lstrip()
-    else:
-        fix = True
-        reason = ""
-
-    misspellings[key] = Misspelling(data, fix, reason)
-
-
-def build_dict(
-    filename: str,
-    misspellings: Dict[str, Misspelling],
-    ignore_words: Set[str],
-) -> None:
-    with open(filename, encoding="utf-8") as f:
-        translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
-        for line in f:
-            [key, data] = line.split("->")
-            # TODO: For now, convert both to lower.
-            #       Someday we can maybe add support for fixing caps.
-            key = key.lower()
-            data = data.lower()
-            if key not in ignore_words:
-                add_misspelling(key, data, misspellings)
-            # generate alternative misspellings/fixes
-            for x, table in translate_tables:
-                if x in key:
-                    alt_key = key.translate(table)
-                    alt_data = data.translate(table)
-                    if alt_key not in ignore_words:
-                        add_misspelling(alt_key, alt_data, misspellings)
-
-
 def is_hidden(filename: str, check_hidden: bool) -> bool:
     bfilename = os.path.basename(filename)
 
@@ -759,16 +710,6 @@ def is_text_file(filename: str) -> bool:
     return b"\x00" not in s
 
 
-def fix_case(word: str, fixword: str) -> str:
-    if word == word.capitalize():
-        return ", ".join(w.strip().capitalize() for w in fixword.split(","))
-    if word == word.upper():
-        return fixword.upper()
-    # they are both lower case
-    # or we don't have any idea
-    return fixword
-
-
 def ask_for_word_fix(
     line: str,
     match: Match[str],
diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py
new file mode 100644
index 0000000000..18a2ec89b4
--- /dev/null
+++ b/codespell_lib/_text_util.py
@@ -0,0 +1,27 @@
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see
+# https://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
+"""
+Copyright (C) 2010-2011  Lucas De Marchi <lucas.de.marchi@gmail.com>
+Copyright (C) 2011  ProFUSION embedded systems
+"""
+
+
+def fix_case(word: str, fixword: str) -> str:
+    if word == word.capitalize():
+        return ", ".join(w.strip().capitalize() for w in fixword.split(","))
+    if word == word.upper():
+        return fixword.upper()
+    # they are both lower case
+    # or we don't have any idea
+    return fixword
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
new file mode 100644
index 0000000000..82865cdd19
--- /dev/null
+++ b/codespell_lib/spellchecker.py
@@ -0,0 +1,75 @@
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see
+# https://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
+"""
+Copyright (C) 2010-2011  Lucas De Marchi <lucas.de.marchi@gmail.com>
+Copyright (C) 2011  ProFUSION embedded systems
+"""
+
+from typing import (
+    Dict,
+    Set,
+)
+
+# Pass all misspellings through this translation table to generate
+# alternative misspellings and fixes.
+alt_chars = (("'", "’"),)  # noqa: RUF001
+
+
+class Misspelling:
+    def __init__(self, data: str, fix: bool, reason: str) -> None:
+        self.data = data
+        self.fix = fix
+        self.reason = reason
+
+
+def add_misspelling(
+    key: str,
+    data: str,
+    misspellings: Dict[str, Misspelling],
+) -> None:
+    data = data.strip()
+
+    if "," in data:
+        fix = False
+        data, reason = data.rsplit(",", 1)
+        reason = reason.lstrip()
+    else:
+        fix = True
+        reason = ""
+
+    misspellings[key] = Misspelling(data, fix, reason)
+
+
+def build_dict(
+    filename: str,
+    misspellings: Dict[str, Misspelling],
+    ignore_words: Set[str],
+) -> None:
+    with open(filename, encoding="utf-8") as f:
+        translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
+        for line in f:
+            [key, data] = line.split("->")
+            # TODO: For now, convert both to lower.
+            #       Someday we can maybe add support for fixing caps.
+            key = key.lower()
+            data = data.lower()
+            if key not in ignore_words:
+                add_misspelling(key, data, misspellings)
+            # generate alternative misspellings/fixes
+            for x, table in translate_tables:
+                if x in key:
+                    alt_key = key.translate(table)
+                    alt_data = data.translate(table)
+                    if alt_key not in ignore_words:
+                        add_misspelling(alt_key, alt_data, misspellings)

From 824bd7c2ea283f1a10baddefb720ef8aa4858c76 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 08:21:10 +0000
Subject: [PATCH 02/11] Replace `data: str` with `candidates: Sequence[str]`

Previously, when the spelling dictionaries were loaded, the correction
line was just stored in memory as plain text. Throughout the code,
callers would then have to deal with the `data` attribute and correctly
`split()` + `strip()` it. With this change, the dictionary parsing
code now encapsulates this problem.

The auto-correction works from the assumption that there is only one
candidate. This assumption is an invariant and seems to be properly
maintained in the code. Therefore, we can just pick the first
candidate word when doing a correction.

In the code, the following name changes are performed:

 * `Misspelling.data` -> `Misspelling.candidates`
 * `fixword` -> `candidates` when used for multiple candidates
   (`fixword` remains for when it is a correction)

On performance:

Performance-wise, this change moves computation from "checking" time
to "startup" time.  The performance cost does not appear to be
noticeable in my baseline (#3419). Though, keep the corpus weakness on
the ratio of cased vs. non-cased corrections with multiple candidates
in mind.

The all-lowercase typo is now slightly more expensive (it was passed
through `fix_case` and fed directly into the `print` in the original
code; in the new code, it will always need a `join`).  There is still
an overweight of lower-case-only corrections in general, so the
unconditional `.join` alone is not sufficient to affect the
performance noticeably.
---
 codespell_lib/_codespell.py   | 36 +++++++++++++++++++----------------
 codespell_lib/_text_util.py   | 10 ++++++----
 codespell_lib/spellchecker.py | 11 ++++++++---
 3 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 89945988dc..2fa15adfe9 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -26,6 +26,10 @@
 import sys
 import textwrap
 from ctypes import wintypes
+from .spellchecker import (
+    build_dict,
+    Misspelling,
+)
 from typing import (
     Any,
     Dict,
@@ -45,7 +49,6 @@
 from ._version import (  # type: ignore[import-not-found]
     __version__ as VERSION,  # noqa: N812
 )
-from .spellchecker import Misspelling, build_dict
 
 word_regex_def = r"[\w\-'’]+"  # noqa: RUF001
 # While we want to treat characters like ( or " as okay for a starting break,
@@ -716,10 +719,10 @@ def ask_for_word_fix(
     misspelling: Misspelling,
     interactivity: int,
     colors: TermColors,
-) -> Tuple[bool, str]:
+) -> Tuple[bool, Sequence[str]]:
     wrongword = match.group()
     if interactivity <= 0:
-        return misspelling.fix, fix_case(wrongword, misspelling.data)
+        return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
     line_ui = (
         f"{line[:match.start()]}"
@@ -729,7 +732,8 @@ def ask_for_word_fix(
 
     if misspelling.fix and interactivity & 1:
         r = ""
-        fixword = fix_case(wrongword, misspelling.data)
+        candidates = fix_case(wrongword, misspelling.candidates)
+        fixword = candidates[0]
         while not r:
             print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True)
             r = sys.stdin.readline().strip().upper()
@@ -747,12 +751,12 @@ def ask_for_word_fix(
         # we ask the user which word to use
 
         r = ""
-        opt = [w.strip() for w in misspelling.data.split(",")]
+        opt = misspelling.candidates
         while not r:
             print(f"{line_ui} Choose an option (blank for none): ", end="")
-            for i, o in enumerate(opt):
-                fixword = fix_case(wrongword, o)
-                print(f" {i}) {fixword}", end="")
+            cased_candidates = fix_case(wrongword, opt)
+            for i, candidates in enumerate(cased_candidates):
+                print(f" {i}) {candidates}", end="")
             print(": ", end="", flush=True)
 
             n = sys.stdin.readline().strip()
@@ -767,9 +771,9 @@ def ask_for_word_fix(
 
         if r:
             misspelling.fix = True
-            misspelling.data = r
+            misspelling.candidates = (r,)
 
-    return misspelling.fix, fix_case(wrongword, misspelling.data)
+    return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
 
 def print_context(
@@ -861,14 +865,14 @@ def parse_file(
                 if lword not in misspellings:
                     continue
                 fix = misspellings[lword].fix
-                fixword = fix_case(word, misspellings[lword].data)
+                candidates = fix_case(word, misspellings[lword].candidates)
 
                 if summary and fix:
                     summary.update(lword)
 
                 cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
+                crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
                 reason = misspellings[lword].reason
                 if reason:
@@ -958,13 +962,13 @@ def parse_file(
 
                 context_shown = False
                 fix = misspellings[lword].fix
-                fixword = fix_case(word, misspellings[lword].data)
+                candidates = fix_case(word, misspellings[lword].candidates)
 
                 if options.interactive and lword not in asked_for:
                     if context is not None:
                         context_shown = True
                         print_context(lines, i, context)
-                    fix, fixword = ask_for_word_fix(
+                    fix, candidates = ask_for_word_fix(
                         lines[i],
                         match,
                         misspellings[lword],
@@ -981,7 +985,7 @@ def parse_file(
 
                 if options.write_changes and fix:
                     changed = True
-                    lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i])
+                    lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i])
                     fixed_words.add(word)
                     continue
 
@@ -996,7 +1000,7 @@ def parse_file(
                 cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                 cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
+                crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
                 reason = misspellings[lword].reason
                 if reason:
diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py
index 18a2ec89b4..c141db503d 100644
--- a/codespell_lib/_text_util.py
+++ b/codespell_lib/_text_util.py
@@ -16,12 +16,14 @@
 Copyright (C) 2011  ProFUSION embedded systems
 """
 
+from typing import Sequence
 
-def fix_case(word: str, fixword: str) -> str:
+
+def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]:
     if word == word.capitalize():
-        return ", ".join(w.strip().capitalize() for w in fixword.split(","))
+        return tuple(c.capitalize() for c in candidates)
     if word == word.upper():
-        return fixword.upper()
+        return tuple(c.upper() for c in candidates)
     # they are both lower case
     # or we don't have any idea
-    return fixword
+    return candidates
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index 82865cdd19..fadaf49e44 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -18,6 +18,7 @@
 
 from typing import (
     Dict,
+    Sequence,
     Set,
 )
 
@@ -27,8 +28,8 @@
 
 
 class Misspelling:
-    def __init__(self, data: str, fix: bool, reason: str) -> None:
-        self.data = data
+    def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
+        self.candidates = candidates
         self.fix = fix
         self.reason = reason
 
@@ -48,7 +49,11 @@ def add_misspelling(
         fix = True
         reason = ""
 
-    misspellings[key] = Misspelling(data, fix, reason)
+    misspellings[key] = Misspelling(
+        tuple(c.strip() for c in data.split(",")),
+        fix,
+        reason,
+    )
 
 
 def build_dict(

From aa7792fee731a5b195277bfe7a9184e04be63da5 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 09:23:23 +0000
Subject: [PATCH 03/11] Refactor dictionary into a new `Spellchecker` class

This is as close to a 1:1 conversion as possible. It might change
when we get to designing the API. The callers have been refactored to
only perform the lookup once. This was mostly to keep the code more
readable.

The performance cost does seem noticeable, which is unsurprising. This
method has a higher cost towards non-matches, which is the most common
case.  This commit causes the performance to drop roughly 10% on its
own, and we are now slower than the goal.
---
 codespell_lib/_codespell.py   | 42 +++++++++++----------
 codespell_lib/_text_util.py   |  2 +-
 codespell_lib/spellchecker.py | 71 ++++++++++++++++++++++-------------
 3 files changed, 68 insertions(+), 47 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 2fa15adfe9..faa546ddf6 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -26,10 +26,6 @@
 import sys
 import textwrap
 from ctypes import wintypes
-from .spellchecker import (
-    build_dict,
-    Misspelling,
-)
 from typing import (
     Any,
     Dict,
@@ -49,6 +45,10 @@
 from ._version import (  # type: ignore[import-not-found]
     __version__ as VERSION,  # noqa: N812
 )
+from .spellchecker import (
+    Misspelling,
+    Spellchecker,
+)
 
 word_regex_def = r"[\w\-'’]+"  # noqa: RUF001
 # While we want to treat characters like ( or " as okay for a starting break,
@@ -837,7 +837,7 @@ def parse_file(
     filename: str,
     colors: TermColors,
     summary: Optional[Summary],
-    misspellings: Dict[str, Misspelling],
+    spellchecker: Spellchecker,
     ignore_words_cased: Set[str],
     exclude_lines: Set[str],
     file_opener: FileOpener,
@@ -862,10 +862,11 @@ def parse_file(
                 if word in ignore_words_cased:
                     continue
                 lword = word.lower()
-                if lword not in misspellings:
+                misspelling = spellchecker.check_lower_cased_word(lword)
+                if misspelling is None:
                     continue
-                fix = misspellings[lword].fix
-                candidates = fix_case(word, misspellings[lword].candidates)
+                fix = misspelling.fix
+                candidates = fix_case(word, misspelling.candidates)
 
                 if summary and fix:
                     summary.update(lword)
@@ -874,7 +875,7 @@ def parse_file(
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
                 crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
-                reason = misspellings[lword].reason
+                reason = misspelling.reason
                 if reason:
                     if options.quiet_level & QuietLevels.DISABLED_FIXES:
                         continue
@@ -946,7 +947,8 @@ def parse_file(
             if word in ignore_words_cased:
                 continue
             lword = word.lower()
-            if lword in misspellings and lword not in extra_words_to_ignore:
+            misspelling = spellchecker.check_lower_cased_word(lword)
+            if misspelling is not None and lword not in extra_words_to_ignore:
                 # Sometimes we find a 'misspelling' which is actually a valid word
                 # preceded by a string escape sequence.  Ignore such cases as
                 # they're usually false alarms; see issue #17 among others.
@@ -956,13 +958,13 @@ def parse_file(
                     and line[char_before_idx] == "\\"
                     # bell, backspace, formfeed, newline, carriage-return, tab, vtab.
                     and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
-                    and lword[1:] not in misspellings
+                    and spellchecker.check_lower_cased_word(lword[1:]) is None
                 ):
                     continue
 
                 context_shown = False
-                fix = misspellings[lword].fix
-                candidates = fix_case(word, misspellings[lword].candidates)
+                fix = misspelling.fix
+                candidates = fix_case(word, misspelling.candidates)
 
                 if options.interactive and lword not in asked_for:
                     if context is not None:
@@ -971,7 +973,7 @@ def parse_file(
                     fix, candidates = ask_for_word_fix(
                         lines[i],
                         match,
-                        misspellings[lword],
+                        misspelling,
                         options.interactive,
                         colors=colors,
                     )
@@ -993,7 +995,7 @@ def parse_file(
                 if (
                     options.interactive & 2
                     and not fix
-                    and not misspellings[lword].reason
+                    and not misspelling.reason
                 ):
                     continue
 
@@ -1002,7 +1004,7 @@ def parse_file(
                 cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
                 crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
-                reason = misspellings[lword].reason
+                reason = misspelling.reason
                 if reason:
                     if options.quiet_level & QuietLevels.DISABLED_FIXES:
                         continue
@@ -1174,9 +1176,9 @@ def main(*args: str) -> int:
                 parser.print_help()
                 return EX_USAGE
             use_dictionaries.append(dictionary)
-    misspellings: Dict[str, Misspelling] = {}
+    spellchecker = Spellchecker()
     for dictionary in use_dictionaries:
-        build_dict(dictionary, misspellings, ignore_words)
+        spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
     colors = TermColors()
     if not options.colors:
         colors.disable()
@@ -1251,7 +1253,7 @@ def main(*args: str) -> int:
                         fname,
                         colors,
                         summary,
-                        misspellings,
+                        spellchecker,
                         ignore_words_cased,
                         exclude_lines,
                         file_opener,
@@ -1276,7 +1278,7 @@ def main(*args: str) -> int:
                 filename,
                 colors,
                 summary,
-                misspellings,
+                spellchecker,
                 ignore_words_cased,
                 exclude_lines,
                 file_opener,
diff --git a/codespell_lib/_text_util.py b/codespell_lib/_text_util.py
index c141db503d..33e6d7e033 100644
--- a/codespell_lib/_text_util.py
+++ b/codespell_lib/_text_util.py
@@ -24,6 +24,6 @@ def fix_case(word: str, candidates: Sequence[str]) -> Sequence[str]:
         return tuple(c.capitalize() for c in candidates)
     if word == word.upper():
         return tuple(c.upper() for c in candidates)
-    # they are both lower case
+    # they are both lower-case
     # or we don't have any idea
     return candidates
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index fadaf49e44..4f19e71269 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -17,9 +17,10 @@
 """
 
 from typing import (
+    Container,
     Dict,
+    Optional,
     Sequence,
-    Set,
 )
 
 # Pass all misspellings through this translation table to generate
@@ -34,7 +35,49 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
         self.reason = reason
 
 
-def add_misspelling(
+class Spellchecker:
+    def __init__(self) -> None:
+        self._misspellings: Dict[str, Misspelling] = {}
+
+    def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
+        """Check a given word against the loaded dictionaries
+
+        :param word: The word to check. This should be all lower-case.
+        """
+        return self._misspellings.get(word)
+
+    def add_from_file(
+        self,
+        filename: str,
+        *,
+        ignore_words: Container[str] = frozenset(),
+    ) -> None:
+        """Parse a codespell dictionary
+
+        :param filename: The codespell dictionary file to parse
+        :param ignore_words: Words to ignore from this dictionary.
+        """
+        misspellings = self._misspellings
+        with open(filename, encoding="utf-8") as f:
+            translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
+            for line in f:
+                [key, data] = line.split("->")
+                # TODO: For now, convert both to lower.
+                #       Someday we can maybe add support for fixing caps.
+                key = key.lower()
+                data = data.lower()
+                if key not in ignore_words:
+                    _add_misspelling(key, data, misspellings)
+                # generate alternative misspellings/fixes
+                for x, table in translate_tables:
+                    if x in key:
+                        alt_key = key.translate(table)
+                        alt_data = data.translate(table)
+                        if alt_key not in ignore_words:
+                            _add_misspelling(alt_key, alt_data, misspellings)
+
+
+def _add_misspelling(
     key: str,
     data: str,
     misspellings: Dict[str, Misspelling],
@@ -54,27 +97,3 @@ def add_misspelling(
         fix,
         reason,
     )
-
-
-def build_dict(
-    filename: str,
-    misspellings: Dict[str, Misspelling],
-    ignore_words: Set[str],
-) -> None:
-    with open(filename, encoding="utf-8") as f:
-        translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
-        for line in f:
-            [key, data] = line.split("->")
-            # TODO: For now, convert both to lower.
-            #       Someday we can maybe add support for fixing caps.
-            key = key.lower()
-            data = data.lower()
-            if key not in ignore_words:
-                add_misspelling(key, data, misspellings)
-            # generate alternative misspellings/fixes
-            for x, table in translate_tables:
-                if x in key:
-                    alt_key = key.translate(table)
-                    alt_data = data.translate(table)
-                    if alt_key not in ignore_words:
-                        add_misspelling(alt_key, alt_data, misspellings)

From ef5096c2d8bc07bf5e9165bfc8b495843ee19b4c Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 10:35:19 +0000
Subject: [PATCH 04/11] Refactor line tokenization to simplify an outer loop

The refactor is a stepping stone towards the next commit where the
inner loop is moved to the `Spellchecker`.
---
 codespell_lib/_codespell.py | 54 +++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index faa546ddf6..d290c6815f 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -37,6 +37,7 @@
     Sequence,
     Set,
     Tuple,
+    Callable,
 )
 
 from ._text_util import fix_case
@@ -833,6 +834,34 @@ def apply_uri_ignore_words(
     return check_matches
 
 
+def line_tokenizer_factory(
+    uri_ignore_words: Set[str],
+    uri_regex: Pattern[str],
+    word_regex: Pattern[str],
+    ignore_word_regex: Optional[Pattern[str]],
+) -> Callable[[str], Iterable[re.Match[str]]]:
+    def line_tokenizer(line: str) -> Iterable[Match[str]]:
+        # If all URI spelling errors will be ignored, erase any URI before
+        # extracting words. Otherwise, apply ignores after extracting words.
+        # This ensures that if a URI ignore word occurs both inside a URI and
+        # outside, it will still be a spelling error.
+        if "*" in uri_ignore_words:
+            line = uri_regex.sub(" ", line)
+        check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
+        if "*" not in uri_ignore_words:
+            check_matches = apply_uri_ignore_words(
+                check_matches,
+                line,
+                word_regex,
+                ignore_word_regex,
+                uri_regex,
+                uri_ignore_words,
+            )
+        return check_matches
+
+    return line_tokenizer
+
+
 def parse_file(
     filename: str,
     colors: TermColors,
@@ -910,6 +939,13 @@ def parse_file(
         except OSError:
             return bad_count
 
+    line_tokenizer = line_tokenizer_factory(
+        uri_ignore_words,
+        uri_regex,
+        word_regex,
+        ignore_word_regex,
+    )
+
     for i, line in enumerate(lines):
         if line.rstrip() in exclude_lines:
             continue
@@ -926,23 +962,7 @@ def parse_file(
         fixed_words = set()
         asked_for = set()
 
-        # If all URI spelling errors will be ignored, erase any URI before
-        # extracting words. Otherwise, apply ignores after extracting words.
-        # This ensures that if a URI ignore word occurs both inside a URI and
-        # outside, it will still be a spelling error.
-        if "*" in uri_ignore_words:
-            line = uri_regex.sub(" ", line)
-        check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
-        if "*" not in uri_ignore_words:
-            check_matches = apply_uri_ignore_words(
-                check_matches,
-                line,
-                word_regex,
-                ignore_word_regex,
-                uri_regex,
-                uri_ignore_words,
-            )
-        for match in check_matches:
+        for match in line_tokenizer(line):
             word = match.group()
             if word in ignore_words_cased:
                 continue

From cd57087893324d672ea94324c384b46d32f08d59 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 11:13:19 +0000
Subject: [PATCH 05/11] Rewrite line spellchecking and move most of it into the
 `Spellchecker`

With this rewrite, performance improved slightly and is now down to 7%
slower than the baseline (6s vs. 5.6s).

An over-indentation is deliberately left in this commit, since that
makes this commit easier to review (without ignoring space
changes).
---
 codespell_lib/_codespell.py   | 48 +++++++++++++----------------------
 codespell_lib/spellchecker.py | 46 +++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 31 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index d290c6815f..4f9aba8664 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -37,7 +37,6 @@
     Sequence,
     Set,
     Tuple,
-    Callable,
 )
 
 from ._text_util import fix_case
@@ -47,7 +46,8 @@
     __version__ as VERSION,  # noqa: N812
 )
 from .spellchecker import (
-    Misspelling,
+    DetectedMisspelling,
+    LineTokenizer,
     Spellchecker,
 )
 
@@ -716,15 +716,17 @@ def is_text_file(filename: str) -> bool:
 
 def ask_for_word_fix(
     line: str,
-    match: Match[str],
-    misspelling: Misspelling,
+    issue: DetectedMisspelling,
     interactivity: int,
     colors: TermColors,
 ) -> Tuple[bool, Sequence[str]]:
-    wrongword = match.group()
+    wrongword = issue.word
+    misspelling = issue.misspelling
     if interactivity <= 0:
         return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
+    match = issue.re_match
+
     line_ui = (
         f"{line[:match.start()]}"
         f"{colors.WWORD}{wrongword}{colors.DISABLE}"
@@ -839,7 +841,7 @@ def line_tokenizer_factory(
     uri_regex: Pattern[str],
     word_regex: Pattern[str],
     ignore_word_regex: Optional[Pattern[str]],
-) -> Callable[[str], Iterable[re.Match[str]]]:
+) -> LineTokenizer:
     def line_tokenizer(line: str) -> Iterable[Match[str]]:
         # If all URI spelling errors will be ignored, erase any URI before
         # extracting words. Otherwise, apply ignores after extracting words.
@@ -867,7 +869,6 @@ def parse_file(
     colors: TermColors,
     summary: Optional[Summary],
     spellchecker: Spellchecker,
-    ignore_words_cased: Set[str],
     exclude_lines: Set[str],
     file_opener: FileOpener,
     word_regex: Pattern[str],
@@ -888,7 +889,7 @@ def parse_file(
     else:
         if options.check_filenames:
             for word in extract_words(filename, word_regex, ignore_word_regex):
-                if word in ignore_words_cased:
+                if word in spellchecker.ignore_words_cased:
                     continue
                 lword = word.lower()
                 misspelling = spellchecker.check_lower_cased_word(lword)
@@ -962,25 +963,12 @@ def parse_file(
         fixed_words = set()
         asked_for = set()
 
-        for match in line_tokenizer(line):
-            word = match.group()
-            if word in ignore_words_cased:
-                continue
-            lword = word.lower()
-            misspelling = spellchecker.check_lower_cased_word(lword)
-            if misspelling is not None and lword not in extra_words_to_ignore:
-                # Sometimes we find a 'misspelling' which is actually a valid word
-                # preceded by a string escape sequence.  Ignore such cases as
-                # they're usually false alarms; see issue #17 among others.
-                char_before_idx = match.start() - 1
-                if (
-                    char_before_idx >= 0
-                    and line[char_before_idx] == "\\"
-                    # bell, backspace, formfeed, newline, carriage-return, tab, vtab.
-                    and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
-                    and spellchecker.check_lower_cased_word(lword[1:]) is None
-                ):
-                    continue
+        issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore)
+        for issue in issues:
+                # TODO: De-indent in next commit
+                misspelling = issue.misspelling
+                word = issue.word
+                lword = issue.lword
 
                 context_shown = False
                 fix = misspelling.fix
@@ -992,8 +980,7 @@ def parse_file(
                         print_context(lines, i, context)
                     fix, candidates = ask_for_word_fix(
                         lines[i],
-                        match,
-                        misspelling,
+                        issue,
                         options.interactive,
                         colors=colors,
                     )
@@ -1197,6 +1184,7 @@ def main(*args: str) -> int:
                 return EX_USAGE
             use_dictionaries.append(dictionary)
     spellchecker = Spellchecker()
+    spellchecker.ignore_words_cased = ignore_words_cased
     for dictionary in use_dictionaries:
         spellchecker.add_from_file(dictionary, ignore_words=ignore_words)
     colors = TermColors()
@@ -1274,7 +1262,6 @@ def main(*args: str) -> int:
                         colors,
                         summary,
                         spellchecker,
-                        ignore_words_cased,
                         exclude_lines,
                         file_opener,
                         word_regex,
@@ -1299,7 +1286,6 @@ def main(*args: str) -> int:
                 colors,
                 summary,
                 spellchecker,
-                ignore_words_cased,
                 exclude_lines,
                 file_opener,
                 word_regex,
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index 4f19e71269..0d87eef366 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -17,8 +17,11 @@
 """
 
 from typing import (
+    Callable,
     Container,
     Dict,
+    Iterable,
+    Match,
     Optional,
     Sequence,
 )
@@ -28,6 +31,9 @@
 alt_chars = (("'", "’"),)  # noqa: RUF001
 
 
+LineTokenizer = Callable[[str], Iterable[Match[str]]]
+
+
 class Misspelling:
     def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
         self.candidates = candidates
@@ -35,9 +41,49 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
         self.reason = reason
 
 
+class DetectedMisspelling:
+
+    def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None:
+        self.word = word
+        self.lword = lword
+        self.misspelling = misspelling
+        self.re_match = match
+
+
 class Spellchecker:
     def __init__(self) -> None:
         self._misspellings: Dict[str, Misspelling] = {}
+        self.ignore_words_cased: Container[str] = frozenset()
+
+    def spellcheck_line(
+        self,
+        line: str,
+        tokenizer: Callable[[str], Iterable[re.Match[str]]],
+        *,
+        extra_words_to_ignore: Container[str] = frozenset()
+    ) -> Iterable[DetectedMisspelling]:
+        misspellings = self._misspellings
+        ignore_words_cased = self.ignore_words_cased
+        for match in tokenizer(line):
+            word = match.group()
+            if word in ignore_words_cased:
+                continue
+            lword = word.lower()
+            misspelling = misspellings.get(lword)
+            if misspelling is not None and lword not in extra_words_to_ignore:
+                # Sometimes we find a 'misspelling' which is actually a valid word
+                # preceded by a string escape sequence.  Ignore such cases as
+                # they're usually false alarms; see issue #17 among others.
+                char_before_idx = match.start() - 1
+                if (
+                    char_before_idx >= 0
+                    and line[char_before_idx] == "\\"
+                    # bell, backspace, formfeed, newline, carriage-return, tab, vtab.
+                    and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
+                    and lword[1:] not in misspellings
+                ):
+                    continue
+                yield DetectedMisspelling(word, lword, misspelling, match)
 
     def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
         """Check a given word against the loaded dictionaries

From 8bd3517ccc556744340d37f6a374697ca1aabdc7 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 11:17:21 +0000
Subject: [PATCH 06/11] De-indent loop body (whitespace-only /
 reformatting-only change)

Deliberately in a separate commit. There are no functional changes, but
there are some reformatting changes (line merges) as a consequence of
the de-indent.
---
 codespell_lib/_codespell.py | 122 +++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 64 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 4f9aba8664..98efc12033 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -965,80 +965,74 @@ def parse_file(
 
         issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore)
         for issue in issues:
-                # TODO: De-indent in next commit
-                misspelling = issue.misspelling
-                word = issue.word
-                lword = issue.lword
+            misspelling = issue.misspelling
+            word = issue.word
+            lword = issue.lword
 
-                context_shown = False
-                fix = misspelling.fix
-                candidates = fix_case(word, misspelling.candidates)
+            context_shown = False
+            fix = misspelling.fix
+            candidates = fix_case(word, misspelling.candidates)
 
-                if options.interactive and lword not in asked_for:
-                    if context is not None:
-                        context_shown = True
-                        print_context(lines, i, context)
-                    fix, candidates = ask_for_word_fix(
-                        lines[i],
-                        issue,
-                        options.interactive,
-                        colors=colors,
-                    )
-                    asked_for.add(lword)
+            if options.interactive and lword not in asked_for:
+                if context is not None:
+                    context_shown = True
+                    print_context(lines, i, context)
+                fix, candidates = ask_for_word_fix(
+                    lines[i],
+                    issue,
+                    options.interactive,
+                    colors=colors,
+                )
+                asked_for.add(lword)
 
-                if summary and fix:
-                    summary.update(lword)
+            if summary and fix:
+                summary.update(lword)
 
-                if word in fixed_words:  # can skip because of re.sub below
-                    continue
+            if word in fixed_words:  # can skip because of re.sub below
+                continue
 
-                if options.write_changes and fix:
-                    changed = True
-                    lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i])
-                    fixed_words.add(word)
-                    continue
+            if options.write_changes and fix:
+                changed = True
+                lines[i] = re.sub(rf"\b{word}\b", candidates[0], lines[i])
+                fixed_words.add(word)
+                continue
 
-                # otherwise warning was explicitly set by interactive mode
-                if (
-                    options.interactive & 2
-                    and not fix
-                    and not misspelling.reason
-                ):
-                    continue
+            # otherwise warning was explicitly set by interactive mode
+            if options.interactive & 2 and not fix and not misspelling.reason:
+                continue
 
-                cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
-                cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
-                cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
-                crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
+            cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
+            cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
+            cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
+            crightword = f"{colors.FWORD}{', '.join(candidates)}{colors.DISABLE}"
 
-                reason = misspelling.reason
-                if reason:
-                    if options.quiet_level & QuietLevels.DISABLED_FIXES:
-                        continue
-                    creason = f"  | {colors.FILE}{reason}{colors.DISABLE}"
-                else:
-                    if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
-                        continue
-                    creason = ""
+            reason = misspelling.reason
+            if reason:
+                if options.quiet_level & QuietLevels.DISABLED_FIXES:
+                    continue
+                creason = f"  | {colors.FILE}{reason}{colors.DISABLE}"
+            else:
+                if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
+                    continue
+                creason = ""
 
-                # If we get to this point (uncorrected error) we should change
-                # our bad_count and thus return value
-                bad_count += 1
+            # If we get to this point (uncorrected error) we should change
+            # our bad_count and thus return value
+            bad_count += 1
 
-                if (not context_shown) and (context is not None):
-                    print_context(lines, i, context)
-                if filename != "-":
-                    print(
-                        f"{cfilename}:{cline}: {cwrongword} "
-                        f"==> {crightword}{creason}"
-                    )
-                elif options.stdin_single_line:
-                    print(f"{cline}: {cwrongword} ==> {crightword}{creason}")
-                else:
-                    print(
-                        f"{cline}: {line.strip()}\n\t{cwrongword} "
-                        f"==> {crightword}{creason}"
-                    )
+            if (not context_shown) and (context is not None):
+                print_context(lines, i, context)
+            if filename != "-":
+                print(
+                    f"{cfilename}:{cline}: {cwrongword} " f"==> {crightword}{creason}"
+                )
+            elif options.stdin_single_line:
+                print(f"{cline}: {cwrongword} ==> {crightword}{creason}")
+            else:
+                print(
+                    f"{cline}: {line.strip()}\n\t{cwrongword} "
+                    f"==> {crightword}{creason}"
+                )
 
     if changed:
         if filename == "-":

From 7273c77edd389154d5e6a848633e506b411d76ae Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 14:21:17 +0000
Subject: [PATCH 07/11] Support non-regex based tokens for `spellcheck_line`

The `Spellchecker` only needs the `group` and `start` methods from the
`re.Match`. With a bit of generics and typing protocols, we can make
the `Spellchecker` work with any token type that provides those two
methods.

The `codespell` command line tool still assumes `re.Match` but it can
get that via its own line tokenizer, so it all works out for everyone.
---
 codespell_lib/_codespell.py   |   6 +-
 codespell_lib/spellchecker.py | 155 +++++++++++++++++++++++++++++++---
 2 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 98efc12033..8cbb1155b8 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -716,7 +716,7 @@ def is_text_file(filename: str) -> bool:
 
 def ask_for_word_fix(
     line: str,
-    issue: DetectedMisspelling,
+    issue: "DetectedMisspelling[re.Match[str]]",
     interactivity: int,
     colors: TermColors,
 ) -> Tuple[bool, Sequence[str]]:
@@ -725,7 +725,7 @@ def ask_for_word_fix(
     if interactivity <= 0:
         return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
-    match = issue.re_match
+    match = issue.token
 
     line_ui = (
         f"{line[:match.start()]}"
@@ -841,7 +841,7 @@ def line_tokenizer_factory(
     uri_regex: Pattern[str],
     word_regex: Pattern[str],
     ignore_word_regex: Optional[Pattern[str]],
-) -> LineTokenizer:
+) -> "LineTokenizer[re.Match[str]]":
     def line_tokenizer(line: str) -> Iterable[Match[str]]:
         # If all URI spelling errors will be ignored, erase any URI before
         # extracting words. Otherwise, apply ignores after extracting words.
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index 0d87eef366..9d1c5398d6 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -16,22 +16,135 @@
 Copyright (C) 2011  ProFUSION embedded systems
 """
 
+import os
+import re
 from typing import (
-    Callable,
     Container,
     Dict,
+    Generic,
     Iterable,
-    Match,
     Optional,
+    Protocol,
     Sequence,
+    TypeVar,
 )
 
 # Pass all misspellings through this translation table to generate
 # alternative misspellings and fixes.
 alt_chars = (("'", "’"),)  # noqa: RUF001
 
+T_co = TypeVar("T_co", bound="Token", covariant=True)
 
-LineTokenizer = Callable[[str], Iterable[Match[str]]]
+
+supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
+supported_languages = supported_languages_en
+
+# Users might want to link this file into /usr/local/bin, so we resolve the
+# symbolic link path to the real path if necessary.
+_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
+_builtin_dictionaries = (
+    # name, desc, name, err in aspell, correction in aspell, \
+    # err dictionary array, rep dictionary array
+    # The arrays must contain the names of aspell dictionaries
+    # The aspell tests here aren't the ideal state, but the None's are
+    # realistic for obscure words
+    ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
+    (
+        "rare",
+        "for rare (but valid) words that are likely to be errors",
+        "_rare",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "informal",
+        "for making informal words more formal",
+        "_informal",
+        True,
+        True,
+        supported_languages_en,
+        supported_languages_en,
+    ),
+    (
+        "usage",
+        "for replacing phrasing with recommended terms",
+        "_usage",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "code",
+        "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)",  # noqa: E501
+        "_code",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "names",
+        "for valid proper names that might be typos",
+        "_names",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "en-GB_to_en-US",
+        "for corrections from en-GB to en-US",
+        "_en-GB_to_en-US",
+        True,
+        True,
+        ("en_GB",),
+        ("en_US",),
+    ),
+)
+_builtin_default = "clear,rare"
+
+_builtin_default_as_tuple = tuple(_builtin_default.split(","))
+
+
+class UnknownBuiltinDictionaryError(ValueError):
+    def __init__(self, name: str) -> None:
+        super().__init__(f"Unknown built-in dictionary: {name}")
+
+
+class BuiltinDictionariesAlreadyLoadedError(TypeError):
+    def __init__(self) -> None:
+        super().__init__(
+            "load_builtin_dictionaries must not be called more than once",
+        )
+
+
+class LineTokenizer(Protocol[T_co]):
+    """Callable that splits a line into multiple tokens to be spellchecked
+
+    Generally, a regex will do for simple cases. A probably too simple one is:
+
+        >>> tokenizer = re.compile(r"[^ ]+").finditer
+
+    For more complex cases, either use more complex regexes or custom tokenization
+    code.
+    """
+
+    def __call__(self, line: str) -> Iterable[T_co]: ...
+
+
+class Token(Protocol):
+    """Describes a token
+
+    This is a protocol to support `re.Match[str]` (which codespell uses) and any
+    other tokenization method that our API consumers might be using.
+    """
+
+    def group(self) -> str: ...
+
+    def start(self) -> int: ...
 
 
 class Misspelling:
@@ -41,13 +154,18 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
         self.reason = reason
 
 
-class DetectedMisspelling:
-
-    def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None:
+class DetectedMisspelling(Generic[T_co]):
+    def __init__(
+        self,
+        word: str,
+        lword: str,
+        misspelling: Misspelling,
+        token: T_co,
+    ) -> None:
         self.word = word
         self.lword = lword
         self.misspelling = misspelling
-        self.re_match = match
+        self.token = token
 
 
 class Spellchecker:
@@ -58,14 +176,25 @@ def __init__(self) -> None:
     def spellcheck_line(
         self,
         line: str,
-        tokenizer: Callable[[str], Iterable[re.Match[str]]],
+        tokenizer: LineTokenizer[T_co],
         *,
         extra_words_to_ignore: Container[str] = frozenset()
-    ) -> Iterable[DetectedMisspelling]:
+    ) -> Iterable[DetectedMisspelling[T_co]]:
+        """Tokenize and spellcheck a line
+
+        Split the line into tokens using the provided tokenizer. See the doc
+        string for the class for an example.
+
+        :param line: The line to spellcheck.
+        :param tokenizer: A callable that will tokenize the line
+        :param extra_words_to_ignore: Extra words to ignore for this particular line
+          (such as content from a `codespell:ignore` comment)
+        """
         misspellings = self._misspellings
         ignore_words_cased = self.ignore_words_cased
-        for match in tokenizer(line):
-            word = match.group()
+
+        for token in tokenizer(line):
+            word = token.group()
             if word in ignore_words_cased:
                 continue
             lword = word.lower()
@@ -74,7 +203,7 @@ def spellcheck_line(
                 # Sometimes we find a 'misspelling' which is actually a valid word
                 # preceded by a string escape sequence.  Ignore such cases as
                 # they're usually false alarms; see issue #17 among others.
-                char_before_idx = match.start() - 1
+                char_before_idx = token.start() - 1
                 if (
                     char_before_idx >= 0
                     and line[char_before_idx] == "\\"
@@ -83,7 +212,7 @@ def spellcheck_line(
                     and lword[1:] not in misspellings
                 ):
                     continue
-                yield DetectedMisspelling(word, lword, misspelling, match)
+                yield DetectedMisspelling(word, lword, misspelling, token)
 
     def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
         """Check a given word against the loaded dictionaries

From c4d1738c0e0fce05cb5d98919e863f14c646b214 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 13:55:49 +0000
Subject: [PATCH 08/11] Speed up spellchecking by ignoring whitespace-only
 lines

The new API has introduced extra overhead per line being spellchecked.
One way of optimizing out this overhead, is to spellcheck fewer lines.
An obvious choice here, is to optimize out empty and whitespace-only
lines, since they will not have any typos at all (on account of not
having any words).

A side-effect of this change is that we now spellcheck lines with
trailing whitespace stripped. Semantically, this gives the same result
(per "whitespace never has typos"). Performance-wise, it is faster in
theory because the strings are now shorter (since we were calling
`.rstrip()` anyway). In practice, I am not sure we are going to find
any real corpus where the trailing whitespace is noteworthy from a
performance point of view.

On the performance corpus from #3491, this takes out ~0.4s of
runtime bringing us down to slightly above the 5.6s that made the
baseline.
---
 codespell_lib/_codespell.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 8cbb1155b8..da9cbcafb6 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -948,7 +948,8 @@ def parse_file(
     )
 
     for i, line in enumerate(lines):
-        if line.rstrip() in exclude_lines:
+        line = line.rstrip()
+        if not line or line in exclude_lines:
             continue
 
         extra_words_to_ignore = set()

From 3c08c9bac1217fa94516fefcc20e391799f7519d Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 13:33:41 +0000
Subject: [PATCH 09/11] Move `codespell:ignore` check into `Spellchecker`

This makes the API automatically avoid some declared false-positives
that the command line tool would also filter.
---
 codespell_lib/_codespell.py   | 13 +------------
 codespell_lib/spellchecker.py | 25 ++++++++++++++++++++++---
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index da9cbcafb6..32aea6bbe1 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -59,7 +59,6 @@
     "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
     "\\b[\\w.%+-]+@[\\w.-]+\\b)"
 )
-inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?")
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
 """
@@ -952,20 +951,10 @@ def parse_file(
         if not line or line in exclude_lines:
             continue
 
-        extra_words_to_ignore = set()
-        match = inline_ignore_regex.search(line)
-        if match:
-            extra_words_to_ignore = set(
-                filter(None, (match.group("words") or "").split(","))
-            )
-            if not extra_words_to_ignore:
-                continue
-
         fixed_words = set()
         asked_for = set()
 
-        issues = spellchecker.spellcheck_line(line, line_tokenizer, extra_words_to_ignore=extra_words_to_ignore)
-        for issue in issues:
+        for issue in spellchecker.spellcheck_line(line, line_tokenizer):
             misspelling = issue.misspelling
             word = issue.word
             lword = issue.lword
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index 9d1c5398d6..f1ad6885b6 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -21,6 +21,7 @@
 from typing import (
     Container,
     Dict,
+    FrozenSet,
     Generic,
     Iterable,
     Optional,
@@ -108,6 +109,8 @@
 
 _builtin_default_as_tuple = tuple(_builtin_default.split(","))
 
+_inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?")
+
 
 class UnknownBuiltinDictionaryError(ValueError):
     def __init__(self, name: str) -> None:
@@ -173,12 +176,21 @@ def __init__(self) -> None:
         self._misspellings: Dict[str, Misspelling] = {}
         self.ignore_words_cased: Container[str] = frozenset()
 
+    def _parse_inline_ignore(self, line: str) -> Optional[FrozenSet[str]]:
+        inline_ignore_match = _inline_ignore_regex.search(line)
+        if inline_ignore_match:
+            words = frozenset(
+                filter(None, (inline_ignore_match.group("words") or "").split(","))
+            )
+            return words if words else None
+        return frozenset()
+
     def spellcheck_line(
         self,
         line: str,
         tokenizer: LineTokenizer[T_co],
         *,
-        extra_words_to_ignore: Container[str] = frozenset()
+        respect_inline_ignore: bool = True,
     ) -> Iterable[DetectedMisspelling[T_co]]:
         """Tokenize and spellcheck a line
 
@@ -187,12 +199,19 @@ def spellcheck_line(
 
         :param line: The line to spellcheck.
         :param tokenizer: A callable that will tokenize the line
-        :param extra_words_to_ignore: Extra words to ignore for this particular line
-          (such as content from a `codespell:ignore` comment)
+        :param respect_inline_ignore: Whether to check the line for
+           `codespell:ignore` instructions
+        :returns: An iterable of discovered typos.
         """
         misspellings = self._misspellings
         ignore_words_cased = self.ignore_words_cased
 
+        extra_words_to_ignore = (
+            self._parse_inline_ignore(line) if respect_inline_ignore else frozenset()
+        )
+        if extra_words_to_ignore is None:
+            return
+
         for token in tokenizer(line):
             word = token.group()
             if word in ignore_words_cased:

From ce280c94275565bec966f410db3ad25f26529545 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Fri, 17 May 2024 13:41:23 +0000
Subject: [PATCH 10/11] Speed up `codespell:ignore` check by skipping the regex
 in most cases

The changes to provide a public API had some performance related costs
of about 1% runtime. There is no trivial way to offset this any
further without undermining the API we are building. However, we can
pull performance-related shenanigans to compensate for the cost
introduced.

The codespell codebase unsurprisingly spends a vast majority of its
runtime in various regex related code such as `search` and `finditer`.

The best way to optimize runtime spend in regexes is to not do a regex
in the first place, since the regex engine has a rather steep overhead
over regular string primitives (that is the cost of flexibility). If
the regex rarely matches and there is a very easy static substring
that can be used to rule out the match, then you can speed up the code
by using `substring in string` as a conditional to skip the
regex. This is assuming the regex is used enough for the performance
to matter.

An obvious choice here falls on the `codespell:ignore` regex, because
it has a very distinctive substring in the form of `codespell:ignore`,
which will rule out almost all lines that will not match.

With this little trick, runtime goes from ~5.6s to ~4.9s on the corpus
mentioned in #3419.
---
 codespell_lib/spellchecker.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index f1ad6885b6..ac43074798 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -109,7 +109,10 @@
 
 _builtin_default_as_tuple = tuple(_builtin_default.split(","))
 
-_inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?")
+_codespell_ignore_tag = "codespell:ignore"
+_inline_ignore_regex = re.compile(
+    rf"[^\w\s]\s?{_codespell_ignore_tag}\b(\s+(?P<words>[\w,]*))?"
+)
 
 
 class UnknownBuiltinDictionaryError(ValueError):
@@ -177,6 +180,8 @@ def __init__(self) -> None:
         self.ignore_words_cased: Container[str] = frozenset()
 
     def _parse_inline_ignore(self, line: str) -> Optional[FrozenSet[str]]:
+        if _codespell_ignore_tag not in line:
+            return frozenset()
         inline_ignore_match = _inline_ignore_regex.search(line)
         if inline_ignore_match:
             words = frozenset(

From ae0e8d2544a92c1a5b4b35ec6ffb2876a1829be7 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Sat, 25 May 2024 09:41:58 +0000
Subject: [PATCH 11/11] Refactor: Rename `spellchecker.py` to
 `_spellchecker.py`

Per review comment.
---
 codespell_lib/_codespell.py                         | 10 +++++-----
 codespell_lib/{spellchecker.py => _spellchecker.py} |  0
 2 files changed, 5 insertions(+), 5 deletions(-)
 rename codespell_lib/{spellchecker.py => _spellchecker.py} (100%)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 32aea6bbe1..baa176cba1 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -39,17 +39,17 @@
     Tuple,
 )
 
+from ._spellchecker import (
+    DetectedMisspelling,
+    LineTokenizer,
+    Spellchecker,
+)
 from ._text_util import fix_case
 
 # autogenerated by setuptools_scm
 from ._version import (  # type: ignore[import-not-found]
     __version__ as VERSION,  # noqa: N812
 )
-from .spellchecker import (
-    DetectedMisspelling,
-    LineTokenizer,
-    Spellchecker,
-)
 
 word_regex_def = r"[\w\-'’]+"  # noqa: RUF001
 # While we want to treat characters like ( or " as okay for a starting break,
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/_spellchecker.py
similarity index 100%
rename from codespell_lib/spellchecker.py
rename to codespell_lib/_spellchecker.py