Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions library/core/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ pub use unicode_data::conversions;
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
pub(crate) use unicode_data::n::lookup as N;
pub(crate) use unicode_data::numeric::lookup as N;
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
pub(crate) use unicode_data::white_space::lookup as White_Space;

pub(crate) mod printable;

mod rt;
#[allow(unreachable_pub)]
mod unicode_data;
pub mod unicode_data;

/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
/// `char` and `str` methods are based on.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Runtime support for `unicode_data`.

#[inline(always)]
const fn bitset_search<
pub(super) const fn bitset_search<
const N: usize,
const CHUNK_SIZE: usize,
const N1: usize,
Expand Down Expand Up @@ -46,23 +48,23 @@ const fn bitset_search<
}

/// A `u32` that packs two fields: a start index in the upper 11 bits and a
/// prefix sum in the lower 21 bits (see `new`, `start_index` and `prefix_sum`).
#[repr(transparent)]
struct ShortOffsetRunHeader(u32);
pub(super) struct ShortOffsetRunHeader(pub(super) u32);

impl ShortOffsetRunHeader {
/// Packs `start_index` into the upper 11 bits and `prefix_sum` into the
/// lower 21 bits of the wrapped `u32`.
///
/// # Panics
/// Panics if `start_index` does not fit in 11 bits or `prefix_sum` does not
/// fit in 21 bits.
const fn new(start_index: usize, prefix_sum: u32) -> Self {
pub(super) const fn new(start_index: usize, prefix_sum: u32) -> Self {
assert!(start_index < (1 << 11));
assert!(prefix_sum < (1 << 21));

// Both values were range-checked above, so the shifted index and the
// prefix sum occupy disjoint bits.
Self((start_index as u32) << 21 | prefix_sum)
}

/// Returns the start index stored in the upper 11 bits.
#[inline]
const fn start_index(&self) -> usize {
pub(super) const fn start_index(&self) -> usize {
(self.0 >> 21) as usize
}

/// Returns the prefix sum stored in the lower 21 bits.
#[inline]
const fn prefix_sum(&self) -> u32 {
pub(super) const fn prefix_sum(&self) -> u32 {
self.0 & ((1 << 21) - 1)
}
}
Expand All @@ -72,7 +74,7 @@ impl ShortOffsetRunHeader {
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
#[inline(always)]
unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
needle: char,
short_offset_runs: &[ShortOffsetRunHeader; SOR],
offsets: &[u8; OFFSETS],
Expand Down Expand Up @@ -126,3 +128,35 @@ unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
}
offset_idx % 2 == 1
}

/// Converts the case of `c`, returning up to three `char`s padded with `'\0'`.
///
/// ASCII input is handed to `ascii_fn`. Anything else is binary-searched in
/// `table` by its first tuple component; a `char` with no entry is returned
/// unchanged. For a hit, the second component is either the single replacement
/// `char` or, when it is not a valid `char`, an encoded index into `multi`.
///
/// # Safety
/// For every entry of `table`, the second component must be either:
/// - a valid `char`, or
/// - a value with bit 22 (`1 << 22`) set whose lower 22 bits are a valid
///   index into `multi`.
#[inline(always)]
pub(super) unsafe fn case_conversion(
    c: char,
    ascii_fn: fn(char) -> char,
    table: &[(char, u32)],
    multi: &[[char; 3]],
) -> [char; 3] {
    // Marker bit for "this is an index into `multi`", and the bits below it.
    const MULTI_FLAG: u32 = 1 << 22;
    const MULTI_INDEX_BITS: u32 = MULTI_FLAG - 1;

    if c.is_ascii() {
        return [ascii_fn(c), '\0', '\0'];
    }

    let encoded = match table.binary_search_by(|&(key, _)| key.cmp(&c)) {
        Ok(pos) => table[pos].1,
        // No entry: the character is its own case conversion.
        Err(_) => return [c, '\0', '\0'],
    };

    if let Some(single) = char::from_u32(encoded) {
        [single, '\0', '\0']
    } else {
        let multi_idx = (encoded & MULTI_INDEX_BITS) as usize;
        // SAFETY: `encoded` is not a valid `char`, so by this function's safety
        // contract its lower 22 bits are an in-bounds index into `multi`.
        unsafe { *multi.get_unchecked(multi_idx) }
    }
}
2,630 changes: 1,370 additions & 1,260 deletions library/core/src/unicode/unicode_data.rs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions library/coretests/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@
#![feature(type_info)]
#![feature(uint_bit_width)]
#![feature(uint_gather_scatter_bits)]
#![feature(unicode_internals)]
#![feature(unsize)]
#![feature(unwrap_infallible)]
// tidy-alphabetical-end
Expand Down
97 changes: 97 additions & 0 deletions library/coretests/tests/unicode.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,102 @@
use core::unicode::unicode_data;
use std::ops::RangeInclusive;

mod test_data;

/// Sanity check: the Unicode version reported by `core::char` has a major
/// component of at least 10.
#[test]
pub fn version() {
    let major = core::char::UNICODE_VERSION.0;
    assert!(major >= 10);
}

/// Exhaustively checks the boolean property function `lookup` against `ranges`.
///
/// Starting at U+0080, every `char` inside one of `ranges` must make `lookup`
/// return `true`, and every `char` in the gaps between ranges (and after the
/// last range, up to `char::MAX`) must make it return `false`. The gap walk
/// assumes `ranges` is sorted in ascending order and non-overlapping.
///
/// # Panics
/// Panics on the first mismatch, with the offending `char` as the message.
#[track_caller]
fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
    // First `char` not yet verified, or `None` once `char::MAX` has been covered.
    let mut next = Some('\u{80}');
    for range in ranges {
        if let Some(start) = next {
            for c in start..*range.start() {
                assert!(!lookup(c), "{c:?}");
            }
        }
        for c in range.clone() {
            assert!(lookup(c), "{c:?}");
        }
        // Step with the `char` range iterator instead of adding 1 to the scalar
        // value: it skips the surrogate gap after U+D7FF and yields `None` past
        // `char::MAX`, both of which make `char::from_u32(end + 1).unwrap()` panic.
        next = (*range.end()..=char::MAX).nth(1);
    }
    if let Some(start) = next {
        for c in start..=char::MAX {
            assert!(!lookup(c), "{c:?}");
        }
    }
}

/// Exhaustively checks the case-mapping function `lookup` against `ranges`.
///
/// Each `(key, val)` entry requires `lookup(key) == val`. Every other `char`
/// from U+0080 up to `char::MAX` must map to itself, i.e. `[c, '\0', '\0']`.
/// The gap walk assumes `ranges` is sorted by key in ascending order.
///
/// # Panics
/// Panics on the first mismatch, with the offending `char` as the message.
#[track_caller]
fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
    // First `char` not yet verified, or `None` once `char::MAX` has been covered.
    let mut next = Some('\u{80}');
    for &(key, val) in ranges {
        if let Some(start) = next {
            for c in start..key {
                assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
            }
        }
        assert_eq!(lookup(key), val, "{key:?}");
        // Step with the `char` range iterator instead of adding 1 to the scalar
        // value: it skips the surrogate gap after U+D7FF and yields `None` past
        // `char::MAX`, both of which make `char::from_u32(key + 1).unwrap()` panic.
        next = (key..=char::MAX).nth(1);
    }
    if let Some(start) = next {
        for c in start..=char::MAX {
            assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
        }
    }
}

/// Runs `test_boolean_property` over `test_data::ALPHABETIC` to check
/// `unicode_data::alphabetic::lookup`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn alphabetic() {
test_boolean_property(test_data::ALPHABETIC, unicode_data::alphabetic::lookup);
}

/// Runs `test_boolean_property` over `test_data::CASE_IGNORABLE` to check
/// `unicode_data::case_ignorable::lookup`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn case_ignorable() {
test_boolean_property(test_data::CASE_IGNORABLE, unicode_data::case_ignorable::lookup);
}

/// Runs `test_boolean_property` over `test_data::CASED` to check
/// `unicode_data::cased::lookup`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn cased() {
test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
}

/// Runs `test_boolean_property` over `test_data::GRAPHEME_EXTEND` to check
/// `unicode_data::grapheme_extend::lookup`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn grapheme_extend() {
test_boolean_property(test_data::GRAPHEME_EXTEND, unicode_data::grapheme_extend::lookup);
}

/// Runs `test_boolean_property` over `test_data::LOWERCASE` to check
/// `unicode_data::lowercase::lookup`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn lowercase() {
test_boolean_property(test_data::LOWERCASE, unicode_data::lowercase::lookup);
}

/// Runs `test_boolean_property` over `test_data::NUMERIC` to check
/// `unicode_data::numeric::lookup`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn numeric() {
test_boolean_property(test_data::NUMERIC, unicode_data::numeric::lookup);
}

/// Runs `test_boolean_property` over `test_data::UPPERCASE` to check
/// `unicode_data::uppercase::lookup`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn uppercase() {
test_boolean_property(test_data::UPPERCASE, unicode_data::uppercase::lookup);
}

/// Runs `test_boolean_property` over `test_data::WHITE_SPACE` to check
/// `unicode_data::white_space::lookup`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn white_space() {
test_boolean_property(test_data::WHITE_SPACE, unicode_data::white_space::lookup);
}

/// Runs `test_case_mapping` over `test_data::TO_LOWER` to check
/// `unicode_data::conversions::to_lower`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn to_lowercase() {
test_case_mapping(test_data::TO_LOWER, unicode_data::conversions::to_lower);
}

/// Runs `test_case_mapping` over `test_data::TO_UPPER` to check
/// `unicode_data::conversions::to_upper`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn to_uppercase() {
test_case_mapping(test_data::TO_UPPER, unicode_data::conversions::to_upper);
}
Loading
Loading