Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/chatty-hotels-sort.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
swc_atoms: major
hstr: major
---

fix(es/ast): Fix unicode unpaired surrogates handling
7 changes: 6 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ jobs:
key: swc-exec-cache-${{ matrix.settings.crate }}-${{ hashFiles('**/Cargo.lock') }}

- name: Run cargo test
if: matrix.settings.crate != 'swc_plugin_backend_tests' && matrix.settings.crate != 'swc_ecma_parser' && matrix.settings.crate != 'swc_ecma_minifier' && matrix.settings.crate != 'swc_core' && matrix.settings.crate != 'swc_ecma_quote' && matrix.settings.crate != 'swc_cli' && matrix.settings.crate != 'binding_core_wasm'
if: matrix.settings.crate != 'swc_plugin_backend_tests' && matrix.settings.crate != 'swc_ecma_parser' && matrix.settings.crate != 'swc_ecma_minifier' && matrix.settings.crate != 'swc_core' && matrix.settings.crate != 'swc_ecma_quote' && matrix.settings.crate != 'swc_cli' && matrix.settings.crate != 'binding_core_wasm' && matrix.settings.crate != 'hstr'
run: |
cargo test -p ${{ matrix.settings.crate }}

Expand All @@ -317,6 +317,11 @@ jobs:
# export CARGO_TARGET_DIR=$(pwd)/target
cargo test -p swc_plugin_backend_tests --release

- name: Run cargo test (hstr)
if: matrix.settings.crate == 'hstr'
run: |
cargo test -p hstr --features serde

- name: Run cargo test (swc_ecma_minifier)
if: matrix.settings.crate == 'swc_ecma_minifier'
run: |
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/hstr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ kstring = { workspace = true }
num_cpus = { workspace = true }
par-iter = { workspace = true }
rand = { workspace = true }
serde_json = { workspace = true }
smartstring = { workspace = true }
smol_str = { workspace = true }
string_cache = { workspace = true }
Expand Down
15 changes: 15 additions & 0 deletions crates/hstr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

use core::str;
use std::{
borrow::Borrow,
fmt::{Debug, Display},
hash::Hash,
mem::{self, forget, transmute, ManuallyDrop},
Expand Down Expand Up @@ -104,6 +105,7 @@ pub use wtf8_atom::Wtf8Atom;
/// - Atoms created via the `atom!` macro or `String::into` are stored in the
/// global atom store. By default, these atoms are never deallocated. To clean
/// up unused atoms, call [global_atom_store_gc].
#[repr(transparent)]
pub struct Atom {
// If this Atom is a dynamic one, this is *const Entry
unsafe_data: TaggedValue,
Expand Down Expand Up @@ -369,6 +371,19 @@ impl PartialEq<Atom> for str {
}
}

impl Borrow<Wtf8Atom> for Atom {
#[inline(always)]
fn borrow(&self) -> &Wtf8Atom {
// SAFETY:
// 1. Wtf8Atom is #[repr(transparent)] over TaggedValue
// 2. Atom is #[repr(transparent)] over TaggedValue
// 3. hstr::Atom and hstr::Wtf8Atom share the same TaggedValue
const _: () = assert!(std::mem::size_of::<Atom>() == std::mem::size_of::<Wtf8Atom>());
const _: () = assert!(std::mem::align_of::<Atom>() == std::mem::align_of::<Wtf8Atom>());
unsafe { transmute::<&Atom, &Wtf8Atom>(self) }
}
}

/// NOT A PUBLIC API
#[cfg(feature = "rkyv")]
impl rkyv::Archive for Atom {
Expand Down
161 changes: 158 additions & 3 deletions crates/hstr/src/wtf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ use core::{
slice, str,
str::FromStr,
};
use std::ops::Add;

mod not_quite_std;

Expand Down Expand Up @@ -68,15 +69,15 @@ impl CodePoint {
///
/// Only use when `value` is known to be less than or equal to 0x10FFFF.
#[inline]
pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
pub const unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
CodePoint { value }
}

/// Create a new `CodePoint` if the value is a valid code point.
///
/// Return `None` if `value` is above 0x10FFFF.
#[inline]
pub fn from_u32(value: u32) -> Option<CodePoint> {
pub const fn from_u32(value: u32) -> Option<CodePoint> {
match value {
0..=0x10ffff => Some(CodePoint { value }),
_ => None,
Expand All @@ -87,7 +88,7 @@ impl CodePoint {
///
/// Since all Unicode scalar values are code points, this always succeeds.
#[inline]
pub fn from_char(value: char) -> CodePoint {
pub const fn from_char(value: char) -> CodePoint {
CodePoint {
value: value as u32,
}
Expand Down Expand Up @@ -118,6 +119,18 @@ impl CodePoint {
pub fn to_char_lossy(&self) -> char {
self.to_char().unwrap_or('\u{FFFD}')
}

/// Return `true` if the code point is in the ASCII range
/// (`U+0000..=U+007F`).
///
/// This is a `const fn`, like the `CodePoint` constructors, so it can also
/// be evaluated in constant contexts.
#[inline]
pub const fn is_ascii(&self) -> bool {
    self.value <= 0x7f
}
}

impl PartialEq<char> for CodePoint {
fn eq(&self, other: &char) -> bool {
self.value == *other as u32
}
}

/// An owned, growable string of well-formed WTF-8 data.
Expand Down Expand Up @@ -165,6 +178,23 @@ impl FromStr for Wtf8Buf {
}
}

impl fmt::Write for Wtf8Buf {
fn write_str(&mut self, s: &str) -> std::fmt::Result {
self.push_str(s);
Ok(())
}
}

impl Add<&Wtf8> for Wtf8Buf {
    type Output = Wtf8Buf;

    /// Concatenation: consumes the left-hand buffer, appends `rhs` to it via
    /// `push_wtf8`, and hands the same buffer back.
    fn add(mut self, rhs: &Wtf8) -> Self::Output {
        self.push_wtf8(rhs);
        self
    }
}

impl Wtf8Buf {
/// Create a new, empty WTF-8 string.
#[inline]
Expand Down Expand Up @@ -313,6 +343,12 @@ impl Wtf8Buf {
self.bytes.truncate(new_len)
}

/// Remove every byte from this WTF-8 string, leaving it empty.
#[inline]
pub fn clear(&mut self) {
    // Shortening the byte vector to length zero discards all contents.
    self.bytes.truncate(0);
}

/// Consume the WTF-8 string and try to convert it to UTF-8.
///
/// This does not copy the data.
Expand Down Expand Up @@ -345,6 +381,26 @@ impl Wtf8Buf {
}
}
}

/// Create a [Wtf8Buf] from a WTF-8 encoded byte vector.
///
/// # Safety
///
/// The caller must ensure that `bytes` is a well-formed WTF-8 byte
/// sequence.
///
/// This means that:
/// - All bytes must form valid UTF-8 sequences OR valid surrogate code
/// point encodings
/// - Surrogate code points may appear unpaired and be encoded separately,
/// but if they are paired, it should be encoded as a single 4-byte UTF-8
/// sequence. For example, the byte sequence `[0xED, 0xA0, 0x80, 0xED,
/// 0xB0, 0x80]` is not valid WTF-8 because WTF-8 forbids encoding a
/// surrogate pair as two separate 3-byte sequences.
#[inline]
pub unsafe fn from_bytes_unchecked(bytes: Vec<u8>) -> Self {
Self { bytes }
}
}

/// Create a new WTF-8 string from an iterator of code points.
Expand Down Expand Up @@ -474,6 +530,12 @@ impl Wtf8 {
self.bytes.is_empty()
}

/// Return `true` if every byte of the string is an ASCII byte.
#[inline]
pub const fn is_ascii(&self) -> bool {
    let raw = &self.bytes;
    raw.is_ascii()
}

/// Return a slice of the given string for the byte range [`begin`..`end`).
///
/// # Failure
Expand Down Expand Up @@ -547,6 +609,34 @@ impl Wtf8 {
}
}

/// Returns `true` if `ch` occurs anywhere in this WTF-8 string.
///
/// The character is converted to a [CodePoint] and the search is delegated
/// to [`contains`](Self::contains).
#[inline]
pub fn contains_char(&self, ch: char) -> bool {
    self.contains(CodePoint::from_char(ch))
}

/// Returns `true` if `code_point` occurs anywhere in this WTF-8 string.
///
/// Walks the string's code points in order and stops at the first match.
#[inline]
pub fn contains(&self, code_point: CodePoint) -> bool {
    for candidate in self.code_points() {
        if candidate == code_point {
            return true;
        }
    }
    false
}

/// Returns `true` if this WTF-8 string starts with the given UTF-8 string.
///
/// The check compares the raw encoded bytes: the string starts with
/// `pattern` exactly when its bytes begin with `pattern`'s UTF-8 bytes.
/// An empty `pattern` always matches, and a `pattern` longer than the
/// string never does.
///
/// Comparing bytes avoids slicing the string at `pattern.len()`, an offset
/// that is not guaranteed to be a code point boundary (for example a
/// one-byte pattern tested against a string that begins with a multi-byte
/// character).
#[inline]
pub fn starts_with(&self, pattern: &str) -> bool {
    self.bytes.starts_with(pattern.as_bytes())
}

/// Try to convert the string to UTF-8 and return a `&str` slice.
///
/// Return `None` if the string contains surrogates.
Expand Down Expand Up @@ -614,6 +704,46 @@ impl Wtf8 {
}
}

/// Returns the uppercase equivalent of this wtf8 slice, as a new [Wtf8Buf].
///
/// Code points that convert to a `char` are mapped with
/// [`char::to_uppercase`], which may yield more than one character. Code
/// points for which `to_char` returns `None` (surrogates) have no case
/// mapping and are copied through unchanged.
#[inline]
pub fn to_uppercase(&self) -> Wtf8Buf {
    let mut result = Wtf8Buf::with_capacity(self.len());
    for cp in self.code_points() {
        match cp.to_char() {
            Some(ch) => {
                for upper_ch in ch.to_uppercase() {
                    result.push_char(upper_ch);
                }
            }
            // `cp` is already a `CodePoint`, so it is appended as-is; no
            // unchecked reconstruction from its `u32` value is needed.
            // The low-level push skips the WTF-8 concatenation check.
            // NOTE(review): presumably fine because `self` is well-formed
            // WTF-8, so a lead surrogate is never directly followed by a
            // trail surrogate — confirm.
            None => not_quite_std::push_code_point(&mut result, cp),
        }
    }
    result
}

/// Returns the lowercase equivalent of this wtf8 slice, as a new [Wtf8Buf].
///
/// Code points that convert to a `char` are mapped with
/// [`char::to_lowercase`], which may yield more than one character. Code
/// points for which `to_char` returns `None` (surrogates) have no case
/// mapping and are copied through unchanged.
#[inline]
pub fn to_lowercase(&self) -> Wtf8Buf {
    let mut result = Wtf8Buf::with_capacity(self.len());
    for cp in self.code_points() {
        match cp.to_char() {
            Some(ch) => {
                for lower_ch in ch.to_lowercase() {
                    result.push_char(lower_ch);
                }
            }
            // `cp` is already a `CodePoint`, so it is appended as-is; no
            // unchecked reconstruction from its `u32` value is needed.
            // The low-level push skips the WTF-8 concatenation check.
            // NOTE(review): presumably fine because `self` is well-formed
            // WTF-8, so a lead surrogate is never directly followed by a
            // trail surrogate — confirm.
            None => not_quite_std::push_code_point(&mut result, cp),
        }
    }
    result
}

/// Create a WTF-8 from a WTF-8 encoded byte slice.
///
/// # Safety
Expand Down Expand Up @@ -770,6 +900,24 @@ impl PartialEq<Wtf8Buf> for &Wtf8 {
}
}

impl PartialEq<str> for &Wtf8 {
    /// Equal only when this string can be viewed as a `&str` (`as_str`
    /// returns `Some`) and that view matches `other`.
    fn eq(&self, other: &str) -> bool {
        self.as_str() == Some(other)
    }
}

impl PartialEq<&str> for &Wtf8 {
    /// Equal only when this string can be viewed as a `&str` (`as_str`
    /// returns `Some`) and that view matches `*other`.
    fn eq(&self, other: &&str) -> bool {
        self.as_str() == Some(*other)
    }
}

impl hash::Hash for CodePoint {
#[inline]
fn hash<H: hash::Hasher>(&self, state: &mut H) {
Expand Down Expand Up @@ -824,6 +972,13 @@ impl<'a> From<&'a str> for &'a Wtf8 {
}
}

impl<'a> From<Wtf8Buf> for Cow<'a, Wtf8> {
    /// Moves the owned buffer into the `Owned` variant; nothing is copied.
    #[inline]
    fn from(owned: Wtf8Buf) -> Self {
        Self::Owned(owned)
    }
}

#[cfg(test)]
mod tests {
use alloc::{format, vec};
Expand Down
Loading
Loading