Skip to content

Commit

Permalink
Merge pull request #105 from Manishearth/reorder-conformance
Browse files Browse the repository at this point in the history
Run reordering conformance tests, more docs for reordering, fix bug
  • Loading branch information
Manishearth authored Mar 21, 2023
2 parents 492381c + 511db3f commit a2a3ba1
Show file tree
Hide file tree
Showing 2 changed files with 185 additions and 75 deletions.
181 changes: 110 additions & 71 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -376,16 +376,93 @@ impl<'text> BidiInfo<'text> {
}
}

/// Re-order a line based on resolved levels and return only the embedding levels, one `Level`
/// per *byte*.
/// Produce the levels for this paragraph as needed for reordering, one level per *byte*
/// in the paragraph. The returned vector includes bytes that are not included
/// in the `line`, but will not adjust them.
///
/// This runs [Rule L1], you can run
/// [Rule L2] by calling [`Self::reorder_visual()`].
/// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
/// to avoid non-byte indices.
///
/// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
///
/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
/// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level> {
let (levels, _) = self.visual_runs(para, line);
assert!(line.start <= self.levels.len());
assert!(line.end <= self.levels.len());

let mut levels = self.levels.clone();
let line_classes = &self.original_classes[line.clone()];
let line_levels = &mut levels[line.clone()];

// Reset some whitespace chars to paragraph level.
// <http://www.unicode.org/reports/tr9/#L1>
let line_str: &str = &self.text[line.clone()];
let mut reset_from: Option<usize> = Some(0);
let mut reset_to: Option<usize> = None;
let mut prev_level = para.level;
for (i, c) in line_str.char_indices() {
match line_classes[i] {
// Segment separator, Paragraph separator
B | S => {
assert_eq!(reset_to, None);
reset_to = Some(i + c.len_utf8());
if reset_from == None {
reset_from = Some(i);
}
}
// Whitespace, isolate formatting
WS | FSI | LRI | RLI | PDI => {
if reset_from == None {
reset_from = Some(i);
}
}
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
// same as above + set the level
RLE | LRE | RLO | LRO | PDF | BN => {
if reset_from == None {
reset_from = Some(i);
}
// also set the level to previous
line_levels[i] = prev_level;
}
_ => {
reset_from = None;
}
}
if let (Some(from), Some(to)) = (reset_from, reset_to) {
for level in &mut line_levels[from..to] {
*level = para.level;
}
reset_from = None;
reset_to = None;
}
prev_level = line_levels[i];
}
if let Some(from) = reset_from {
for level in &mut line_levels[from..] {
*level = para.level;
}
}
levels
}

/// Re-order a line based on resolved levels and return only the embedding levels, one `Level`
/// per *character*.
/// Produce the levels for this paragraph as needed for reordering, one level per *character*
/// in the paragraph. The returned vector includes characters that are not included
/// in the `line`, but will not adjust them.
///
/// This runs [Rule L1], you can run
/// [Rule L2] by calling [`Self::reorder_visual()`].
/// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
/// to avoid non-byte indices.
///
/// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
///
/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
/// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn reordered_levels_per_char(
&self,
Expand All @@ -397,6 +474,11 @@ impl<'text> BidiInfo<'text> {
}

/// Re-order a line based on resolved levels and return the line in display order.
///
/// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
///
/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, str> {
let (levels, runs) = self.visual_runs(para, line.clone());
Expand Down Expand Up @@ -536,69 +618,33 @@ impl<'text> BidiInfo<'text> {
///
/// `line` is a range of bytes indices within `levels`.
///
/// The first return value is a vector of levels used by the reordering algorithm,
/// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
/// the result of [Rule L2], showing the visual order that each level run (a run of text with the
/// same level) should be displayed. Within each run, the display order can be checked
/// against the Level vector.
///
/// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
/// as that should be handled by the engine using this API.
///
/// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by
/// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead
/// of producing a level map, since one may wish to deal with the fact that this is operating on
/// byte rather than character indices.
///
/// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
///
/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
/// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn visual_runs(
&self,
para: &ParagraphInfo,
line: Range<usize>,
) -> (Vec<Level>, Vec<LevelRun>) {
assert!(line.start <= self.levels.len());
assert!(line.end <= self.levels.len());

let mut levels = self.levels.clone();
let line_classes = &self.original_classes[line.clone()];
let line_levels = &mut levels[line.clone()];

// Reset some whitespace chars to paragraph level.
// <http://www.unicode.org/reports/tr9/#L1>
let line_str: &str = &self.text[line.clone()];
let mut reset_from: Option<usize> = Some(0);
let mut reset_to: Option<usize> = None;
let mut prev_level = para.level;
for (i, c) in line_str.char_indices() {
match line_classes[i] {
// Segment separator, Paragraph separator
B | S => {
assert_eq!(reset_to, None);
reset_to = Some(i + c.len_utf8());
if reset_from == None {
reset_from = Some(i);
}
}
// Whitespace, isolate formatting
WS | FSI | LRI | RLI | PDI => {
if reset_from == None {
reset_from = Some(i);
}
}
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
// same as above + set the level
RLE | LRE | RLO | LRO | PDF | BN => {
if reset_from == None {
reset_from = Some(i);
}
// also set the level to previous
line_levels[i] = prev_level;
}
_ => {
reset_from = None;
}
}
if let (Some(from), Some(to)) = (reset_from, reset_to) {
for level in &mut line_levels[from..to] {
*level = para.level;
}
reset_from = None;
reset_to = None;
}
prev_level = line_levels[i];
}
if let Some(from) = reset_from {
for level in &mut line_levels[from..] {
*level = para.level;
}
}
let levels = self.reordered_levels(para, line.clone());

// Find consecutive level runs.
let mut runs = Vec::new();
Expand Down Expand Up @@ -626,31 +672,25 @@ impl<'text> BidiInfo<'text> {

// Stop at the lowest *odd* level.
min_level = min_level.new_lowest_ge_rtl().expect("Level error");

// This loop goes through contiguous chunks of level runs that have a level
// ≥ max_level and reverses their contents, reducing max_level by 1 each time.
//
// It can do this check with the original levels instead of checking reorderings because all
// prior reorderings will have been for contiguous chunks of levels >> max, which will
// be a subset of these chunks anyway.
while max_level >= min_level {
// Look for the start of a sequence of consecutive runs of max_level or higher.
let mut seq_start = 0;
while seq_start < run_count {
if self.levels[runs[seq_start].start] < max_level {
if levels[runs[seq_start].start] < max_level {
seq_start += 1;
continue;
}

// Found the start of a sequence. Now find the end.
let mut seq_end = seq_start + 1;
while seq_end < run_count {
if self.levels[runs[seq_end].start] < max_level {
if levels[runs[seq_end].start] < max_level {
break;
}
seq_end += 1;
}

// Reverse the runs within this sequence.
runs[seq_start..seq_end].reverse();

Expand All @@ -660,7 +700,6 @@ impl<'text> BidiInfo<'text> {
.lower(1)
.expect("Lowering embedding level below zero");
}

(levels, runs)
}

Expand Down Expand Up @@ -984,7 +1023,7 @@ mod tests {
// Testing for RLE Character
assert_eq!(
reorder_paras("\u{202B}abc אבג\u{202C}"),
vec!["\u{202B}\u{202C}גבא abc"]
vec!["\u{202b}גבא abc\u{202c}"]
);

// Testing neutral characters
Expand Down
79 changes: 75 additions & 4 deletions tests/conformance_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::collections::BTreeMap;
use unicode_bidi::bidi_class;
use unicode_bidi::{format_chars, level, BidiInfo, Level};

#[derive(Debug)]
#[allow(dead_code)] // lint ignores the Debug impl but that's what we use this for
struct Fail {
pub line_num: usize,
pub input_base_level: Option<Level>,
Expand All @@ -21,7 +23,41 @@ struct Fail {
pub exp_ordering: Vec<String>,
pub actual_base_level: Option<Level>,
pub actual_levels: Vec<Level>,
// TODO pub actual_ordering: Vec<String>,
/// A list of reordered indices, with X9 characters removed
pub actual_ordering: Vec<String>,
/// The full reordered index map (map[visual] = logical)
/// without X9 characters removed
pub actual_unfiltered_ordering: Vec<usize>,
}

/// Turns the output of BidiInfo::visual_runs() to a per-*character* visual-to-logical map,
/// compatible with that returned by `reorder_visual()`
fn reorder_map_from_visual_runs(info: &BidiInfo) -> Vec<usize> {
let para = &info.paragraphs[0];
let (levels, runs) = info.visual_runs(para, para.range.clone());
let char_index_map: BTreeMap<usize, usize> = info
.text
.char_indices()
.enumerate()
.map(|(logical, (byte, _ch))| (byte, logical))
.collect();
let mut map = Vec::new();
for run in runs {
if levels[run.start].is_rtl() {
for byte_idx in run.rev() {
if let Some(logical) = char_index_map.get(&byte_idx) {
map.push(*logical);
}
}
} else {
for byte_idx in run {
if let Some(logical) = char_index_map.get(&byte_idx) {
map.push(*logical);
}
}
}
}
map
}

#[test]
Expand Down Expand Up @@ -82,7 +118,23 @@ fn test_basic_conformance() {
let exp_levels: Vec<String> = exp_levels.iter().map(|x| x.to_owned()).collect();
let para = &bidi_info.paragraphs[0];
let levels = bidi_info.reordered_levels_per_char(para, para.range.clone());
if levels != exp_levels {

let reorder_map = BidiInfo::reorder_visual(&levels);
let visual_runs_levels = reorder_map_from_visual_runs(&bidi_info);

// The conformance tests only require this to be true for the non-ignored characters
// However, as an internal invariant of this crate we would like to ensure these stay
// the same to reduce confusion. This is an assert instead of appending to the `fails`
// list since it is testing an internal invariant between two APIs.
assert_eq!(reorder_map, visual_runs_levels, "Maps returned by reorder_visual() and visual_runs() must be the same, for line {line}");

let actual_ordering: Vec<String> = reorder_map
.iter()
.filter(|logical_idx| exp_levels[**logical_idx] != "x")
.map(|logical_idx| logical_idx.to_string())
.collect();

if levels != exp_levels || actual_ordering != exp_ordering {
fails.push(Fail {
line_num: line_idx + 1,
input_base_level,
Expand All @@ -91,6 +143,8 @@ fn test_basic_conformance() {
exp_base_level: None,
exp_levels: exp_levels.to_owned(),
exp_ordering: exp_ordering.to_owned(),
actual_ordering,
actual_unfiltered_ordering: reorder_map,
actual_base_level: None,
actual_levels: levels.to_owned(),
});
Expand Down Expand Up @@ -175,15 +229,32 @@ fn test_character_conformance() {
// Check levels
let para = &bidi_info.paragraphs[0];
let levels = bidi_info.reordered_levels_per_char(para, para.range.clone());
if levels != exp_levels {

let reorder_map = BidiInfo::reorder_visual(&levels);
let visual_runs_levels = reorder_map_from_visual_runs(&bidi_info);

// The conformance tests only require this to be true for the non-ignored characters
// However, as an internal invariant of this crate we would like to ensure these stay
// the same to reduce confusion. This is an assert instead of appending to the `fails`
// list since it is testing an internal invariant between two APIs.
assert_eq!(reorder_map, visual_runs_levels, "Maps returned by reorder_visual() and visual_runs() must be the same, for line {line}");

let actual_ordering: Vec<String> = reorder_map
.iter()
.filter(|logical_idx| exp_levels[**logical_idx] != "x")
.map(|logical_idx| logical_idx.to_string())
.collect();
if levels != exp_levels || exp_ordering != actual_ordering {
fails.push(Fail {
line_num: line_idx + 1,
input_base_level,
input_classes: vec![],
input_string: input_string.to_owned(),
exp_base_level: Some(exp_base_level),
exp_levels: exp_levels.to_owned(),
exp_ordering: exp_ordering.to_owned(),
exp_ordering: exp_ordering,
actual_ordering: actual_ordering,
actual_unfiltered_ordering: reorder_map,
actual_base_level: None,
actual_levels: levels.to_owned(),
});
Expand Down

0 comments on commit a2a3ba1

Please sign in to comment.