From 5ebdfdfb80681109f0412a5330b807ff17c9621d Mon Sep 17 00:00:00 2001 From: Kenton-r Date: Sun, 9 Nov 2025 13:16:47 +0700 Subject: [PATCH] Fix line breaking for Khmer text (issue #7218) --- components/segmenter/src/complex/language.rs | 13 ++- components/segmenter/src/line.rs | 85 ++++++++++++++++++-- 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/components/segmenter/src/complex/language.rs b/components/segmenter/src/complex/language.rs index 327eea5e20b..821078031a7 100644 --- a/components/segmenter/src/complex/language.rs +++ b/components/segmenter/src/complex/language.rs @@ -59,7 +59,11 @@ impl<'s> Iterator for LanguageIterator<'s> { fn next(&mut self) -> Option { let mut indices = self.rest.char_indices(); let lang = get_language(indices.next()?.1 as u32); - match indices.find(|&(_, ch)| get_language(ch as u32) != lang) { + match indices.find(|&(_, ch)| { + let ch_lang = get_language(ch as u32); + !(ch_lang == Language::Unknown && ch.is_whitespace()) + && ch_lang != lang + }) { Some((i, _)) => { let (result, rest) = self.rest.split_at(i); self.rest = rest; @@ -88,7 +92,12 @@ impl<'s> Iterator for LanguageIteratorUtf16<'s> { match self .rest .iter() - .position(|&ch| get_language(ch as u32) != lang) + .position(|&ch| { + let ch_lang = get_language(ch as u32); + !(ch_lang == Language::Unknown + && (ch == 0x20 || ch == 0x09 || ch == 0x0A || ch == 0x0D)) + && ch_lang != lang + }) { Some(i) => { let (result, rest) = self.rest.split_at(i); diff --git a/components/segmenter/src/line.rs b/components/segmenter/src/line.rs index 072aaaaef72..f5681312d40 100644 --- a/components/segmenter/src/line.rs +++ b/components/segmenter/src/line.rs @@ -879,14 +879,32 @@ impl Iterator for LineBreakIterator<'_, '_, Y> { // UAX14 doesn't have Thai etc, so use another way. if self.options.word_option != LineBreakWordOption::BreakAll - && Y::use_complex_breaking(self, left_codepoint) - && Y::use_complex_breaking(self, right_codepoint) { - let result = Y::line_handle_complex_language(self, left_codepoint); - if result.is_some() { - return result; + // Extended to handle SA-SPACE(s)-SA sequences + let should_use_complex = if Y::use_complex_breaking(self, left_codepoint) { + if Y::use_complex_breaking(self, right_codepoint) { + true // SA × SA + } else if right_prop == SP { + // SA × SP - check if SA continues after space(s) + self.peek_past_spaces_for_sa() + } else { + false + } + } else { + false + }; + + if should_use_complex { + let result = Y::line_handle_complex_language(self, left_codepoint); + if result.is_some() { + return result; + } } - // I may have to fetch text until non-SA character?. + } + + // Suppress UAX#14 breaks at SA × SP when SA continues after space(s) + if left_prop == SA && right_prop == SP && self.peek_past_spaces_for_sa() { + continue; } // If break_state is equals or grater than 0, it is alias of property. @@ -1067,6 +1085,31 @@ impl LineBreakIterator<'_, '_, Y> { _ => false, } } + + /// Helper: Check if spaces are followed by SA (complex script) characters. + /// Peeks past all consecutive SP (space) characters to see if SA continues. + /// Returns true if SA is found after space(s), false otherwise. + /// Restores iterator position before returning. + fn peek_past_spaces_for_sa(&mut self) -> bool { + let temp_iter = self.iter.clone(); + let temp_pos = self.current_pos_data; + self.advance_iter(); + + let mut has_sa = false; + while let Some(c) = self.get_current_codepoint() { + let p = self.get_linebreak_property(c); + if p == SP { + self.advance_iter(); + } else { + has_sa = p == SA; + break; + } + } + + self.iter = temp_iter; + self.current_pos_data = temp_pos; + has_sa + } } impl LineBreakType for Utf8 { @@ -1120,6 +1163,29 @@ fn line_handle_complex_language_utf8( where T: LineBreakType, { + /// Helper: Check if spaces are followed by more SA characters. + fn peek_past_spaces_for_sa(iter: &mut LineBreakIterator<'_, '_, T>) -> bool + where + T: LineBreakType, + { + let temp_iter = iter.iter.clone(); + let temp_pos = iter.current_pos_data; + + let mut has_sa = false; + while let Some(c) = iter.get_current_codepoint() { + if c == ' ' { + iter.advance_iter(); + } else { + has_sa = T::use_complex_breaking(iter, c); + break; + } + } + + iter.iter = temp_iter; + iter.current_pos_data = temp_pos; + has_sa + } + // word segmenter doesn't define break rules for some languages such as Thai. let start_iter = iter.iter.clone(); let start_point = iter.current_pos_data; @@ -1130,7 +1196,12 @@ where s.push(iter.get_current_codepoint()?); iter.advance_iter(); if let Some(current_codepoint) = iter.get_current_codepoint() { - if !T::use_complex_breaking(iter, current_codepoint) { + // Continue collecting if SA, or if space(s) followed by SA + if T::use_complex_breaking(iter, current_codepoint) { + continue; + } else if current_codepoint == ' ' && peek_past_spaces_for_sa(iter) { + continue; + } else { break; } } else {