Fix indexing when looking into words within a margin

This comes up in two different branches - we want to split up a word so into two segments based on a minimum margin of characters at the beginning and end of the word. For example we might split "10th" as "1" and "0th" based on a margin of 1, and also "10" and "th" and "10t" "h". There is surprising behavior (in `&"áoóáoó"[5..=5]` for example, which panics) that an inclusive range on the right-hand side is equivalent to an exclusive range on the right side with the `end` being `+ 1`. That isn't true for byte indexes within a UTF-8 encoded string: it might be within a multi-byte codepoint. So we need to switch to a regular Rust range (exclusive on the right). This means using the word's `str::len` when the margin is `1` and the `nth - 2` codepoint's byte index otherwise. This commit refactors this all as a helper function (which seems very complex actually) which covers adjusting the indices as well, reducing some boilerplate between the callers. (It ends up being more code in absolute terms though - I wonder if just returning the range is wiser, or eliminating the shared function.)
helix-editor · Aug 26, 2024 · a8f9e33 · a8f9e33
1 parent 13eb168
commit a8f9e33
Showing 1 changed file with 47 additions and 28 deletions.
diff --git a/src/checker.rs b/src/checker.rs
@@ -1303,26 +1303,14 @@ impl<'a, S: BuildHasher> Checker<'a, S> {
         num_parts: usize,
         allow_bad_forceucase: Forceucase,
     ) -> Option<CompoundingResult> {
-        let min_num_chars = self
+        let min_chars = self
             .aff
             .options
             .compound_min_length
             .map(|len| len.get())
             .unwrap_or(3) as usize;
 
-        let start_byte = word[start_pos..]
-            .char_indices()
-            .nth(min_num_chars)
-            .map(|(byte, _)| byte + start_pos)?;
-        let end_byte = word
-            .char_indices()
-            .rev()
-            .take_while(|(byte, _)| *byte >= start_byte)
-            .nth(min_num_chars - 1)
-            .map(|(byte, _)| byte)?;
-
-        for (i, _) in word[start_byte..=end_byte].char_indices() {
-            let i = i + start_byte;
+        for i in bytes_within(word, start_pos, min_chars) {
             if let Some(result) = self.check_compound_classic::<MODE>(
                 word,
                 start_pos,
@@ -2045,20 +2033,7 @@ impl<'a, S: BuildHasher> Checker<'a, S> {
         // Note that this cannot be zero but we cast it to a usize for easier use.
         debug_assert_ne!(min_chars, 0);
 
-        let start_byte = word[start_pos..]
-            .char_indices()
-            .nth(min_chars)
-            .map(|(byte, _)| byte + start_pos)?;
-        let end_byte = word
-            .char_indices()
-            .rev()
-            .nth(min_chars - 1)
-            .map(|(byte, _)| byte)
-            .filter(|byte| *byte >= start_byte)?;
-
-        for (i, _) in word[start_byte..=end_byte].char_indices() {
-            // Adjust `i` to be the absolute index in the str.
-            let i = i + start_byte;
+        for i in bytes_within(word, start_pos, min_chars) {
             let Some((part1_stem, part1_flags)) =
                 self.aff
                     .words
@@ -2194,6 +2169,50 @@ fn prev_codepoint(word: &str, byte: usize) -> Option<(usize, char)> {
     word[..byte].char_indices().next_back()
 }
 
+/// An iterator over each byte index in the given `word` within the given margin.
+///
+/// For example in "abcd" with a margin of 1, this will yield `[1, 2]` (0-indexed).
+///
+/// `start` must be a valid byte index within `word` and `margin` must be non-zero.
+fn bytes_within(word: &str, start: usize, margin: usize) -> impl Iterator<Item = usize> + '_ {
+    // TODO: this seems unnecessarily complex. Can we just use prev/next_codepoint
+    // helpers instead?.
+    fn byte_range(word: &str, start: usize, margin: usize) -> Option<std::ops::Range<usize>> {
+        let start_byte = word[start..]
+            .char_indices()
+            .nth(margin)
+            .map(|(byte, _)| byte + start)?;
+        // We want the byte _after_ the `nth - 1` character from the back because the  range is
+        // exclusive on the right-hand side. We need an exclusive range  because `&word[n..=n]`
+        // has surprising behavior: it's equivalent to `&word[n..n + 1]` which is not correct when
+        // `n + 1` is not a valid byte index.
+        //
+        // When looking at the surrounding one character this must be the total word  length then
+        // (again - it's an exclusive range on the right). Using a saturating subtraction here
+        // to avoid a panic from `- 2` would not be equivalent.
+        let end_byte = if margin == 1 {
+            word.len()
+        } else {
+            word.char_indices()
+                .rev()
+                .take_while(|(byte, _)| *byte >= start_byte)
+                .nth(margin - 2)
+                .map(|(byte, _)| byte)?
+        };
+        Some(start_byte..end_byte)
+    }
+
+    debug_assert_ne!(margin, 0);
+
+    byte_range(word, start, margin)
+        .into_iter()
+        .flat_map(move |range| {
+            word[range.clone()]
+                .char_indices()
+                .map(move |(byte, _)| byte + range.start)
+        })
+}
+
 // TODO: rename?
 #[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
 pub(crate) enum Forceucase {