diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b7d1862b..71ae8659 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -138,11 +138,21 @@ jobs: run: cargo +stable install rustfilt - name: Check x86_64 inlining run: | - ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt - ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp" - RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx2.txt --no-default-features RUSTFLAGS="-C target-feature=+sse4.2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-sse42.txt --no-default-features + - name: Check x86_64 inlining with avx2 autoselection + run: | + ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-old.txt + ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-old.txt "--features public_imp" + RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt + if: ${{ matrix.toolchain == '1.38.0' }} + - name: Check x86_64 inlining with avx512 autoselection + run: | + ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt + ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp" + RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx512.txt + RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx512.txt --no-default-features + if: ${{ matrix.toolchain != '1.38.0' }} - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.toolchain }} diff --git a/Cargo.toml b/Cargo.toml index f3ae8206..eccd5c1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,3 +53,6 @@ targets = ["aarch64-unknown-linux-gnu", "wasm32-unknown-unknown", "wasm32-wasip1 [dependencies] flexpect = "0.1.1" + +[build-dependencies] +rustversion = "1.0.22" diff --git a/README.md b/README.md index bcdd9d59..ede39012 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,12 @@ This library has been thoroughly tested with sample data as well as fuzzing and ## Features * `basic` API for the fastest validation, optimized for valid UTF-8 * `compat` API as a fully compatible replacement for `std::str::from_utf8()` +* 🆕 AVX 512 support on modern x86/x86-64 CPUs since Rust 1.89 * Supports AVX 2 and SSE 4.2 implementations on x86 and x86-64 * ARM64 (aarch64) SIMD is supported since Rust 1.61 * WASM (wasm32) SIMD is supported * 🆕 armv7 NEON support with the `armv7_neon` feature on nightly Rust -* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCI +* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCII * aarch64: Up to eleven times faster than the std library on valid non-ASCII, up to four times faster on ASCII (Apple Silicon) * Faster than the original simdjson implementation * Selects the fastest implementation at runtime based on CPU support (on x86) @@ -71,14 +72,17 @@ This comes at a slight performance penalty compared to the `basic` API even if t ## Implementation selection ### X86 -The fastest implementation is selected at runtime using the `std::is_x86_feature_detected!` macro, unless the CPU -targeted by the compiler supports the fastest available implementation. -So if you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine, the AVX 2 implementation is selected at -compile-time and runtime selection is disabled. +The fastest implementation is usually selected at runtime using the `std::is_x86_feature_detected!` macro. The AVX 512 +implementation is however only selected if the CPU support the VBMI2 features to avoid throttling happening with CPUs before +Intels Ice Lake microarchitecture. + +If you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine whichs support AVX 512 with Rust 1.89 or later, +the AVX 512 implementation is selected at compile-time and runtime selection is disabled. For no-std support (compiled with `--no-default-features`) the implementation is always selected at compile time based on the targeted CPU. Use `RUSTFLAGS="-C target-feature=+avx2"` for the AVX 2 implementation or `RUSTFLAGS="-C target-feature=+sse4.2"` -for the SSE 4.2 implementation. +for the SSE 4.2 implementation. For AVX 512 use `RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2"` with +Rust 1.89 or later. ### ARM64 The SIMD implementation is used automatically since Rust 1.61. diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..ca6ed851 --- /dev/null +++ b/build.rs @@ -0,0 +1,17 @@ +fn main() { + println!("cargo::rustc-check-cfg=cfg(avx512_stable)"); + // `if rustversion::cfg!(...)` is not supported in older Rust versions + if avx512_stable() { + println!("cargo:rustc-cfg=avx512_stable"); + } +} + +#[rustversion::since(1.89)] +fn avx512_stable() -> bool { + true +} + +#[rustversion::before(1.89)] +fn avx512_stable() -> bool { + false +} diff --git a/inlining/expected-methods-x86-nostd-avx512.txt b/inlining/expected-methods-x86-nostd-avx512.txt new file mode 100644 index 00000000..643c0462 --- /dev/null +++ b/inlining/expected-methods-x86-nostd-avx512.txt @@ -0,0 +1,5 @@ +simdutf8::implementation::helpers::get_compat_error +simdutf8::implementation::x86::validate_utf8_basic +simdutf8::implementation::x86::validate_utf8_basic_avx512 +simdutf8::implementation::x86::validate_utf8_compat +simdutf8::implementation::x86::validate_utf8_compat_avx512 diff --git a/inlining/expected-methods-x86-std-avx512.txt b/inlining/expected-methods-x86-std-avx512.txt new file mode 100644 index 00000000..643c0462 --- /dev/null +++ b/inlining/expected-methods-x86-std-avx512.txt @@ -0,0 +1,5 @@ +simdutf8::implementation::helpers::get_compat_error +simdutf8::implementation::x86::validate_utf8_basic +simdutf8::implementation::x86::validate_utf8_basic_avx512 +simdutf8::implementation::x86::validate_utf8_compat +simdutf8::implementation::x86::validate_utf8_compat_avx512 diff --git a/inlining/expected-methods-x86-std-old.txt b/inlining/expected-methods-x86-std-old.txt new file mode 100644 index 00000000..4cbccc66 --- /dev/null +++ b/inlining/expected-methods-x86-std-old.txt @@ -0,0 +1,9 @@ +simdutf8::implementation::helpers::get_compat_error +simdutf8::implementation::validate_utf8_basic_fallback +simdutf8::implementation::validate_utf8_compat_fallback +simdutf8::implementation::x86::avx2::validate_utf8_basic +simdutf8::implementation::x86::avx2::validate_utf8_compat +simdutf8::implementation::x86::sse42::validate_utf8_basic +simdutf8::implementation::x86::sse42::validate_utf8_compat +simdutf8::implementation::x86::validate_utf8_basic::get_fastest +simdutf8::implementation::x86::validate_utf8_compat::get_fastest diff --git a/inlining/expected-methods-x86-std.txt b/inlining/expected-methods-x86-std.txt index 4cbccc66..6edecc76 100644 --- a/inlining/expected-methods-x86-std.txt +++ b/inlining/expected-methods-x86-std.txt @@ -3,6 +3,8 @@ simdutf8::implementation::validate_utf8_basic_fallback simdutf8::implementation::validate_utf8_compat_fallback simdutf8::implementation::x86::avx2::validate_utf8_basic simdutf8::implementation::x86::avx2::validate_utf8_compat +simdutf8::implementation::x86::avx512::validate_utf8_basic +simdutf8::implementation::x86::avx512::validate_utf8_compat simdutf8::implementation::x86::sse42::validate_utf8_basic simdutf8::implementation::x86::sse42::validate_utf8_compat simdutf8::implementation::x86::validate_utf8_basic::get_fastest diff --git a/src/basic.rs b/src/basic.rs index 23a72951..02623d8b 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -197,6 +197,16 @@ pub mod imp { /// Includes the x86/x86-64 SIMD implementations. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub mod x86 { + /// Includes the validation implementation for AVX 512-compatible CPUs. + /// + /// Using the provided functionality on CPUs which do not support AVX 512 is undefined + /// behavior and will very likely cause a crash. + #[cfg(avx512_stable)] + pub mod avx512 { + pub use crate::implementation::x86::avx512::validate_utf8_basic as validate_utf8; + pub use crate::implementation::x86::avx512::ChunkedUtf8ValidatorImp; + pub use crate::implementation::x86::avx512::Utf8ValidatorImp; + } /// Includes the validation implementation for AVX 2-compatible CPUs. /// /// Using the provided functionality on CPUs which do not support AVX 2 is undefined diff --git a/src/compat.rs b/src/compat.rs index dadb4378..d9482fc2 100644 --- a/src/compat.rs +++ b/src/compat.rs @@ -105,6 +105,11 @@ pub mod imp { /// Includes the x86/x86-64 SIMD implementations. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub mod x86 { + /// Includes the validation implementation for AVX 512-compatible CPUs. + #[cfg(avx512_stable)] + pub mod avx512 { + pub use crate::implementation::x86::avx512::validate_utf8_compat as validate_utf8; + } /// Includes the validation implementation for AVX 2-compatible CPUs. pub mod avx2 { pub use crate::implementation::x86::avx2::validate_utf8_compat as validate_utf8; diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index b0381aa7..68f7fad1 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -182,7 +182,10 @@ macro_rules! algorithm_simd { unsafe fn check_block(&mut self, input: SimdInput) { // WORKAROUND // necessary because the for loop is not unrolled on ARM64 - if input.vals.len() == 2 { + if input.vals.len() == 1 { + self.check_bytes(*input.vals.as_ptr()); + self.incomplete = Self::is_incomplete(*input.vals.as_ptr()); + } else if input.vals.len() == 2 { self.check_bytes(*input.vals.as_ptr()); self.check_bytes(*input.vals.as_ptr().add(1)); self.incomplete = Self::is_incomplete(*input.vals.as_ptr().add(1)); @@ -237,13 +240,7 @@ macro_rules! algorithm_simd { } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(tmpbuf.0.as_ptr()); + let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx); algorithm.check_utf8(simd_input); } algorithm.check_incomplete_pending(); @@ -329,14 +326,7 @@ macro_rules! algorithm_simd { break; } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(tmpbuf.0.as_ptr()); - + let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx); algorithm.check_utf8(simd_input); } algorithm.check_incomplete_pending(); @@ -534,6 +524,18 @@ macro_rules! simd_input_128_bit { } } + $(#[$feat])* + #[inline] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = TempSimdChunk::new(); + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( + ptr, + tmpbuf.0.as_mut_ptr(), + len, + ); + Self::new(tmpbuf.0.as_ptr()) + } + $(#[$feat])* #[inline] unsafe fn is_ascii(&self) -> bool { @@ -565,6 +567,18 @@ macro_rules! simd_input_256_bit { } } + $(#[$feat])* + #[inline] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = TempSimdChunk::new(); + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( + ptr, + tmpbuf.0.as_mut_ptr(), + len, + ); + Self::new(tmpbuf.0.as_ptr()) + } + $(#[$feat])* #[inline] unsafe fn is_ascii(&self) -> bool { @@ -573,3 +587,41 @@ macro_rules! simd_input_256_bit { } }; } + +macro_rules! simd_input_512_bit { + ($(#[$feat:meta])*) => { + #[repr(C)] + struct SimdInput { + vals: [SimdU8Value; 1], + } + + impl SimdInput { + $(#[$feat])* + #[inline] + unsafe fn new(ptr: *const u8) -> Self { + Self { + vals: [ + SimdU8Value::load_from(ptr), + ], + } + } + + + $(#[$feat])* + #[inline] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + Self { + vals: [ + SimdU8Value::load_from_partial(ptr, len), + ], + } + } + + $(#[$feat])* + #[inline] + unsafe fn is_ascii(&self) -> bool { + self.vals[0].is_ascii() + } + } + }; +} diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs index 6fa0f18b..7796750f 100644 --- a/src/implementation/helpers.rs +++ b/src/implementation/helpers.rs @@ -139,6 +139,10 @@ impl TempSimdChunkA16 { #[allow(dead_code)] // only used if a 256-bit SIMD implementation is used pub(crate) struct TempSimdChunkA32(pub(crate) [u8; SIMD_CHUNK_SIZE]); +#[repr(C, align(64))] +#[allow(dead_code)] // only used if a 256-bit SIMD implementation is used +pub(crate) struct TempSimdChunkA64(pub(crate) [u8; SIMD_CHUNK_SIZE]); + #[allow(dead_code)] // only used if there is a SIMD implementation impl TempSimdChunkA32 { #[flexpect::e(clippy::inline_always)] @@ -148,6 +152,15 @@ impl TempSimdChunkA32 { } } +#[allow(dead_code)] // only used if there is a SIMD implementation +impl TempSimdChunkA64 { + #[flexpect::e(clippy::inline_always)] + #[inline(always)] // needs to be forced because otherwise it is not inlined on armv7 neo + pub(crate) const fn new() -> Self { + Self([0; SIMD_CHUNK_SIZE]) + } +} + #[derive(Clone, Copy)] #[allow(dead_code)] // only used if there is a SIMD implementation pub(crate) struct SimdU8Value(pub(crate) T) diff --git a/src/implementation/x86/avx512.rs b/src/implementation/x86/avx512.rs new file mode 100644 index 00000000..f3b69a9e --- /dev/null +++ b/src/implementation/x86/avx512.rs @@ -0,0 +1,276 @@ +//! Contains the x86-64 AVX512 UTF-8 validation implementation. + +#[cfg(target_arch = "x86")] +use core::arch::x86::{ + __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_loadu_si512, _mm512_maskz_loadu_epi8, + _mm512_movepi8_mask, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8, + _mm512_set_epi64, _mm512_set_epi8, _mm512_setzero_si512, _mm512_shuffle_epi8, + _mm512_srli_epi16, _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, + _MM_HINT_T0, +}; + +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::{ + __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_loadu_si512, _mm512_maskz_loadu_epi8, + _mm512_movepi8_mask, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8, + _mm512_set_epi64, _mm512_set_epi8, _mm512_setzero_si512, _mm512_shuffle_epi8, + _mm512_srli_epi16, _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, + _MM_HINT_T0, +}; + +use crate::implementation::helpers::Utf8CheckAlgorithm; + +// AVX 2 SIMD primitives + +type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m512i>; + +impl SimdU8Value { + #[flexpect::e(clippy::cast_possible_wrap)] + #[flexpect::e(clippy::too_many_arguments)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn from_32_cut_off_leading( + v0: u8, + v1: u8, + v2: u8, + v3: u8, + v4: u8, + v5: u8, + v6: u8, + v7: u8, + v8: u8, + v9: u8, + v10: u8, + v11: u8, + v12: u8, + v13: u8, + v14: u8, + v15: u8, + v16: u8, + v17: u8, + v18: u8, + v19: u8, + v20: u8, + v21: u8, + v22: u8, + v23: u8, + v24: u8, + v25: u8, + v26: u8, + v27: u8, + v28: u8, + v29: u8, + v30: u8, + v31: u8, + ) -> Self { + Self::from(_mm512_set_epi8( + v31 as i8, v30 as i8, v29 as i8, v28 as i8, v27 as i8, v26 as i8, v25 as i8, v24 as i8, + v23 as i8, v22 as i8, v21 as i8, v20 as i8, v19 as i8, v18 as i8, v17 as i8, v16 as i8, + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, + v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, + v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, + v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, + )) + } + + #[flexpect::e(clippy::too_many_arguments)] + #[flexpect::e(clippy::cast_possible_wrap)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn repeat_16( + v0: u8, + v1: u8, + v2: u8, + v3: u8, + v4: u8, + v5: u8, + v6: u8, + v7: u8, + v8: u8, + v9: u8, + v10: u8, + v11: u8, + v12: u8, + v13: u8, + v14: u8, + v15: u8, + ) -> Self { + Self::from(_mm512_set_epi8( + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + )) + } + + #[flexpect::e(clippy::cast_ptr_alignment)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn load_from(ptr: *const u8) -> Self { + Self::from(_mm512_loadu_si512(ptr.cast::<__m512i>())) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn load_from_partial(ptr: *const u8, len: usize) -> Self { + let res = _mm512_maskz_loadu_epi8(u64::MAX >> (64 - len), ptr.cast::()); + Self::from(res) + } + + #[flexpect::e(clippy::too_many_arguments)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn lookup_16( + self, + v0: u8, + v1: u8, + v2: u8, + v3: u8, + v4: u8, + v5: u8, + v6: u8, + v7: u8, + v8: u8, + v9: u8, + v10: u8, + v11: u8, + v12: u8, + v13: u8, + v14: u8, + v15: u8, + ) -> Self { + Self::from(_mm512_shuffle_epi8( + Self::repeat_16( + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + ) + .0, + self.0, + )) + } + + #[flexpect::e(clippy::cast_possible_wrap)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn splat(val: u8) -> Self { + Self::from(_mm512_set1_epi8(val as i8)) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn splat0() -> Self { + Self::from(_mm512_setzero_si512()) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn or(self, b: Self) -> Self { + Self::from(_mm512_or_si512(self.0, b.0)) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn and(self, b: Self) -> Self { + Self::from(_mm512_and_si512(self.0, b.0)) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn xor(self, b: Self) -> Self { + Self::from(_mm512_xor_si512(self.0, b.0)) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn saturating_sub(self, b: Self) -> Self { + Self::from(_mm512_subs_epu8(self.0, b.0)) + } + + // ugly but shr requires const generics + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn shr4(self) -> Self { + Self::from(_mm512_srli_epi16(self.0, 4)).and(Self::splat(0xFF >> 4)) + } + + // ugly but prev requires const generics + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn prev1(self, prev: Self) -> Self { + const SHIFT: i32 = 16 - 1; + return Self::from(_mm512_alignr_epi8( + self.0, + _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0), + SHIFT, + )); + } + // ugly but prev requires const generics + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn prev2(self, prev: Self) -> Self { + const SHIFT: i32 = 16 - 2; + return Self::from(_mm512_alignr_epi8( + self.0, + _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0), + SHIFT, + )); + } + + // ugly but prev requires const generics + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn prev3(self, prev: Self) -> Self { + const SHIFT: i32 = 16 - 3; + return Self::from(_mm512_alignr_epi8( + self.0, + _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0), + SHIFT, + )); + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn any_bit_set(self) -> bool { + _mm512_test_epi8_mask(self.0, self.0) != 0 + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn is_ascii(self) -> bool { + _mm512_movepi8_mask(self.0) == 0 + } +} + +impl From<__m512i> for SimdU8Value { + #[inline] + fn from(val: __m512i) -> Self { + Self(val) + } +} + +impl Utf8CheckAlgorithm { + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value { + let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0xe0 - 0x80)); + let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0xf0 - 0x80)); + is_third_byte.or(is_fourth_byte) + } +} + +#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] +#[inline] +unsafe fn simd_prefetch(ptr: *const u8) { + _mm_prefetch(ptr.cast::(), _MM_HINT_T0); +} + +const PREFETCH: bool = true; +#[cfg(feature = "public_imp")] +use crate::implementation::helpers::TempSimdChunkA64 as TempSimdChunk; +simd_input_512_bit!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); +algorithm_simd!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 23b7515e..cd279e37 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -1,11 +1,69 @@ -#[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))] +#[cfg(all( + avx512_stable, + any( + feature = "public_imp", + // always availabe, except if no-std and no avx512 support + feature = "std", + all( + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ) +)))] +pub(crate) mod avx512; + +#[cfg(any( + feature = "public_imp", + // std: sse 4.2 is available for auto-selection unless avx512 is selected at compile time + all( + feature = "std", + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )) + ), + // no-std: no avx512 -> select avx2 + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ) +))] pub(crate) mod avx2; #[cfg(any( feature = "public_imp", - all(feature = "std", not(target_feature = "avx2")), + // std: sse 4.2 is available for auto-selection unless avx512 or avx2 are selected at compile time + all( + feature = "std", + not(any(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ),all(not(avx512_stable), target_feature = "avx2"))), + ), + // no-std: no avx512, no avx2 -> select sse4.2 all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ) @@ -14,7 +72,17 @@ pub(crate) mod sse42; // validate_utf8_basic() std: implementation auto-selection -#[cfg(all(feature = "std", not(target_feature = "avx2")))] +#[cfg(all( + feature = "std", + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + not(all(not(avx512_stable), target_feature = "avx2")) +))] #[inline] pub(crate) unsafe fn validate_utf8_basic( input: &[u8], @@ -26,9 +94,32 @@ pub(crate) unsafe fn validate_utf8_basic( type FnRaw = *mut (); type ValidateUtf8Fn = unsafe fn(input: &[u8]) -> Result<(), crate::basic::Utf8Error>; + #[cfg(avx512_stable)] + #[inline] + fn get_avx512_implementation() -> Option { + // Test for avx512vbmi2 to make sure we have a newer CPU with a non-throttling AVX-512 implementation + if std::is_x86_feature_detected!("avx512f") + && std::is_x86_feature_detected!("avx512bw") + && std::is_x86_feature_detected!("avx512vbmi") + && std::is_x86_feature_detected!("avx512vbmi2") + { + return Some(avx512::validate_utf8_basic); + } + None + } + + #[cfg(not(avx512_stable))] + #[inline] + fn get_avx512_implementation() -> Option { + None + } + + #[flexpect::e(clippy::option_if_let_else)] #[inline] fn get_fastest_available_implementation_basic() -> ValidateUtf8Fn { - if std::is_x86_feature_detected!("avx2") { + if let Some(fun) = get_avx512_implementation() { + fun + } else if std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_basic } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_basic @@ -55,7 +146,51 @@ pub(crate) unsafe fn validate_utf8_basic( // validate_utf8_basic() no-std: implementation selection by config -#[cfg(target_feature = "avx2")] +#[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" +))] +pub(crate) unsafe fn validate_utf8_basic( + input: &[u8], +) -> core::result::Result<(), crate::basic::Utf8Error> { + if input.len() < super::helpers::SIMD_CHUNK_SIZE { + return super::validate_utf8_basic_fallback(input); + } + + validate_utf8_basic_avx512(input) +} + +#[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" +))] +#[inline(never)] +unsafe fn validate_utf8_basic_avx512( + input: &[u8], +) -> core::result::Result<(), crate::basic::Utf8Error> { + avx512::validate_utf8_basic(input) +} + +#[cfg(any( + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ), + all(target_feature = "avx2", feature = "std", not(avx512_stable)) +))] pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { @@ -66,7 +201,20 @@ pub(crate) unsafe fn validate_utf8_basic( validate_utf8_basic_avx2(input) } -#[cfg(target_feature = "avx2")] +#[cfg(any( + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ), + all(target_feature = "avx2", feature = "std", not(avx512_stable)) +))] #[inline(never)] unsafe fn validate_utf8_basic_avx2( input: &[u8], @@ -76,6 +224,13 @@ unsafe fn validate_utf8_basic_avx2( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ))] @@ -91,6 +246,13 @@ pub(crate) unsafe fn validate_utf8_basic( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ))] @@ -103,6 +265,13 @@ unsafe fn validate_utf8_basic_sse42( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), not(target_feature = "sse4.2") ))] @@ -110,7 +279,17 @@ pub(crate) use super::validate_utf8_basic_fallback as validate_utf8_basic; // validate_utf8_compat() std: implementation auto-selection -#[cfg(all(feature = "std", not(target_feature = "avx2")))] +#[cfg(all( + feature = "std", + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + not(all(not(avx512_stable), target_feature = "avx2")) +))] #[inline] pub(crate) unsafe fn validate_utf8_compat( input: &[u8], @@ -122,9 +301,32 @@ pub(crate) unsafe fn validate_utf8_compat( type FnRaw = *mut (); type ValidateUtf8CompatFn = unsafe fn(input: &[u8]) -> Result<(), crate::compat::Utf8Error>; + #[cfg(avx512_stable)] + #[inline] + fn get_avx512_implementation() -> Option { + // Test for avx512vbmi2 to make sure we have a newer CPU with a non-throttling AVX-512 implementation + if std::is_x86_feature_detected!("avx512f") + && std::is_x86_feature_detected!("avx512bw") + && std::is_x86_feature_detected!("avx512vbmi") + && std::is_x86_feature_detected!("avx512vbmi2") + { + return Some(avx512::validate_utf8_compat); + } + None + } + + #[cfg(not(avx512_stable))] + #[inline] + fn get_avx512_implementation() -> Option { + None + } + + #[flexpect::e(clippy::option_if_let_else)] #[inline] fn get_fastest_available_implementation_compat() -> ValidateUtf8CompatFn { - if std::is_x86_feature_detected!("avx2") { + if let Some(fun) = get_avx512_implementation() { + fun + } else if std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_compat } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_compat @@ -151,7 +353,51 @@ pub(crate) unsafe fn validate_utf8_compat( // validate_utf8_basic() no-std: implementation selection by config -#[cfg(target_feature = "avx2")] +#[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" +))] +pub(crate) unsafe fn validate_utf8_compat( + input: &[u8], +) -> core::result::Result<(), crate::compat::Utf8Error> { + if input.len() < super::helpers::SIMD_CHUNK_SIZE { + return super::validate_utf8_compat_fallback(input); + } + + validate_utf8_compat_avx512(input) +} + +#[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" +))] +#[inline(never)] +unsafe fn validate_utf8_compat_avx512( + input: &[u8], +) -> core::result::Result<(), crate::compat::Utf8Error> { + avx512::validate_utf8_compat(input) +} + +#[cfg(any( + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ), + all(target_feature = "avx2", feature = "std", not(avx512_stable)) +))] pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { @@ -162,7 +408,20 @@ pub(crate) unsafe fn validate_utf8_compat( validate_utf8_compat_avx2(input) } -#[cfg(target_feature = "avx2")] +#[cfg(any( + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ), + all(target_feature = "avx2", feature = "std", not(avx512_stable)) +))] #[inline(never)] unsafe fn validate_utf8_compat_avx2( input: &[u8], @@ -172,6 +431,13 @@ unsafe fn validate_utf8_compat_avx2( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ))] @@ -187,6 +453,13 @@ pub(crate) unsafe fn validate_utf8_compat( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ))] @@ -199,6 +472,13 @@ pub(crate) unsafe fn validate_utf8_compat_sse42( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), not(target_feature = "sse4.2") ))] diff --git a/src/lib.rs b/src/lib.rs index 1f05450c..0ef442fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -95,14 +95,17 @@ //! ## Implementation selection //! //! ### X86 -//! The fastest implementation is selected at runtime using the `std::is_x86_feature_detected!` macro, unless the CPU -//! targeted by the compiler supports the fastest available implementation. -//! So if you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine, the AVX 2 implementation is selected at -//! compile-time and runtime selection is disabled. +//! The fastest implementation is usually selected at runtime using the `std::is_x86_feature_detected!` macro. The AVX 512 +//! implementation is however only selected if the CPU support the VBMI2 features to avoid throttling happening with CPUs before +//! Intels Ice Lake microarchitecture. +//! +//! If you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine whichs support AVX 512 with Rust 1.89 or later, +//! the AVX 512 implementation is selected at compile-time and runtime selection is disabled. //! //! For no-std support (compiled with `--no-default-features`) the implementation is always selected at compile time based on //! the targeted CPU. Use `RUSTFLAGS="-C target-feature=+avx2"` for the AVX 2 implementation or `RUSTFLAGS="-C target-feature=+sse4.2"` -//! for the SSE 4.2 implementation. +//! for the SSE 4.2 implementation. For AVX 512 use `RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2"` with +//! Rust 1.89 or later. //! //! ### ARM64 //! The SIMD implementation is used automatically since Rust 1.61. diff --git a/tests/tests.rs b/tests/tests.rs index 082746c9..57431651 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -69,6 +69,23 @@ mod public_imp { #[allow(unused_variables)] // nothing to do if not SIMD implementation is available pub(super) fn test_valid(input: &[u8]) { if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + #[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ))] + unsafe { + assert!(simdutf8::basic::imp::x86::avx512::validate_utf8(input).is_ok()); + assert!(simdutf8::compat::imp::x86::avx512::validate_utf8(input).is_ok()); + + test_streaming::(input, true); + test_chunked_streaming::( + input, true, + ); + } + #[cfg(target_feature = "avx2")] unsafe { assert!(simdutf8::basic::imp::x86::avx2::validate_utf8(input).is_ok()); @@ -138,6 +155,24 @@ mod public_imp { #[allow(unused_variables)] // nothing to do if not SIMD implementation is available pub(super) fn test_invalid(input: &[u8], valid_up_to: usize, error_len: Option) { if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + #[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ))] + unsafe { + assert!(simdutf8::basic::imp::x86::avx512::validate_utf8(input).is_err()); + let err = simdutf8::compat::imp::x86::avx512::validate_utf8(input).unwrap_err(); + assert_eq!(err.valid_up_to(), valid_up_to); + assert_eq!(err.error_len(), error_len); + + test_streaming::(input, false); + test_chunked_streaming::( + input, false, + ); + } #[cfg(target_feature = "avx2")] unsafe { assert!(simdutf8::basic::imp::x86::avx2::validate_utf8(input).is_err()); @@ -268,6 +303,24 @@ mod public_imp { } } + #[test] + #[should_panic] + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ) + ))] + fn test_avx2_chunked_panic() { + test_chunked_streaming_with_chunk_size::< + simdutf8::basic::imp::x86::avx512::ChunkedUtf8ValidatorImp, + >(b"abcd", 1, true); + } + #[test] #[should_panic] #[cfg(all(