initial avx512 implementation

hkratz · hkratz · commit eb704c005461 · 2025-11-21T06:16:09.000Z
diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
@@ -182,7 +182,10 @@ macro_rules! algorithm_simd {
             unsafe fn check_block(&mut self, input: SimdInput) {
                 // WORKAROUND
                 // necessary because the for loop is not unrolled on ARM64
-                if input.vals.len() == 2 {
+                if input.vals.len() == 1 {
+                    self.check_bytes(*input.vals.as_ptr());
+                    self.incomplete = Self::is_incomplete(*input.vals.as_ptr());
+                } else if input.vals.len() == 2 {
                     self.check_bytes(*input.vals.as_ptr());
                     self.check_bytes(*input.vals.as_ptr().add(1));
                     self.incomplete = Self::is_incomplete(*input.vals.as_ptr().add(1));
@@ -573,3 +576,30 @@ macro_rules! simd_input_256_bit {
         }
     };
 }
+
+macro_rules! simd_input_512_bit {
+    ($(#[$feat:meta])*) => {
+        #[repr(C)]
+        struct SimdInput {
+            vals: [SimdU8Value; 1],
+        }
+
+        impl SimdInput {
+            $(#[$feat])*
+            #[inline]
+            unsafe fn new(ptr: *const u8) -> Self {
+                Self {
+                    vals: [
+                        SimdU8Value::load_from(ptr),
+                    ],
+                }
+            }
+
+            $(#[$feat])*
+            #[inline]
+            unsafe fn is_ascii(&self) -> bool {
+                self.vals[0].is_ascii()
+            }
+        }
+    };
+}
diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs
@@ -139,6 +139,10 @@ impl TempSimdChunkA16 {
 #[allow(dead_code)] // only used if a 256-bit SIMD implementation is used
 pub(crate) struct TempSimdChunkA32(pub(crate) [u8; SIMD_CHUNK_SIZE]);
 
+#[repr(C, align(64))]
+#[allow(dead_code)] // only used if a 512-bit SIMD implementation is used
+pub(crate) struct TempSimdChunkA64(pub(crate) [u8; SIMD_CHUNK_SIZE]);
+
 #[allow(dead_code)] // only used if there is a SIMD implementation
 impl TempSimdChunkA32 {
     #[flexpect::e(clippy::inline_always)]
@@ -148,6 +152,15 @@ impl TempSimdChunkA32 {
     }
 }
 
+#[allow(dead_code)] // only used if there is a SIMD implementation
+impl TempSimdChunkA64 {
+    #[flexpect::e(clippy::inline_always)]
+    #[inline(always)] // needs to be forced because otherwise it is not inlined on armv7 neon
+    pub(crate) const fn new() -> Self {
+        Self([0; SIMD_CHUNK_SIZE])
+    }
+}
+
 #[derive(Clone, Copy)]
 #[allow(dead_code)] // only used if there is a SIMD implementation
 pub(crate) struct SimdU8Value<T>(pub(crate) T)
diff --git a/src/implementation/x86/avx512.rs b/src/implementation/x86/avx512.rs
@@ -0,0 +1,271 @@
+//! Contains the x86-64 AVX512 UTF-8 validation implementation.
+
+use core::arch::x86_64::{
+    __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_cmpgt_epi8_mask, _mm512_loadu_si512,
+    _mm512_maskz_abs_epi8, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8,
+    _mm512_set_epi64, _mm512_setzero_si512, _mm512_shuffle_epi8, _mm512_srli_epi16,
+    _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, _MM_HINT_T0,
+};
+use core::arch::x86_64::{_mm512_movepi8_mask, _mm512_set_epi8};
+
+use crate::implementation::helpers::Utf8CheckAlgorithm;
+
+// AVX-512 SIMD primitives
+
+type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m512i>;
+
+impl SimdU8Value {
+    #[flexpect::e(clippy::cast_possible_wrap)]
+    #[flexpect::e(clippy::too_many_arguments)]
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn from_32_cut_off_leading(
+        v0: u8,
+        v1: u8,
+        v2: u8,
+        v3: u8,
+        v4: u8,
+        v5: u8,
+        v6: u8,
+        v7: u8,
+        v8: u8,
+        v9: u8,
+        v10: u8,
+        v11: u8,
+        v12: u8,
+        v13: u8,
+        v14: u8,
+        v15: u8,
+        v16: u8,
+        v17: u8,
+        v18: u8,
+        v19: u8,
+        v20: u8,
+        v21: u8,
+        v22: u8,
+        v23: u8,
+        v24: u8,
+        v25: u8,
+        v26: u8,
+        v27: u8,
+        v28: u8,
+        v29: u8,
+        v30: u8,
+        v31: u8,
+    ) -> Self {
+        Self::from(_mm512_set_epi8(
+            v31 as i8, v30 as i8, v29 as i8, v28 as i8, v27 as i8, v26 as i8, v25 as i8, v24 as i8,
+            v23 as i8, v22 as i8, v21 as i8, v20 as i8, v19 as i8, v18 as i8, v17 as i8, v16 as i8,
+            v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
+            v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
+            v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8,
+            v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8,
+            v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8,
+            v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8,
+        ))
+    }
+
+    #[flexpect::e(clippy::too_many_arguments)]
+    #[flexpect::e(clippy::cast_possible_wrap)]
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn repeat_16(
+        v0: u8,
+        v1: u8,
+        v2: u8,
+        v3: u8,
+        v4: u8,
+        v5: u8,
+        v6: u8,
+        v7: u8,
+        v8: u8,
+        v9: u8,
+        v10: u8,
+        v11: u8,
+        v12: u8,
+        v13: u8,
+        v14: u8,
+        v15: u8,
+    ) -> Self {
+        Self::from(_mm512_set_epi8(
+            v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
+            v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
+            v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
+            v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
+            v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
+            v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
+            v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
+            v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
+        ))
+    }
+
+    #[flexpect::e(clippy::cast_ptr_alignment)]
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn load_from(ptr: *const u8) -> Self {
+        Self::from(_mm512_loadu_si512(ptr.cast::<__m512i>()))
+    }
+
+    #[flexpect::e(clippy::too_many_arguments)]
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn lookup_16(
+        self,
+        v0: u8,
+        v1: u8,
+        v2: u8,
+        v3: u8,
+        v4: u8,
+        v5: u8,
+        v6: u8,
+        v7: u8,
+        v8: u8,
+        v9: u8,
+        v10: u8,
+        v11: u8,
+        v12: u8,
+        v13: u8,
+        v14: u8,
+        v15: u8,
+    ) -> Self {
+        Self::from(_mm512_shuffle_epi8(
+            Self::repeat_16(
+                v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+            )
+            .0,
+            self.0,
+        ))
+    }
+
+    #[flexpect::e(clippy::cast_possible_wrap)]
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn splat(val: u8) -> Self {
+        Self::from(_mm512_set1_epi8(val as i8))
+    }
+
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn splat0() -> Self {
+        Self::from(_mm512_setzero_si512())
+    }
+
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn or(self, b: Self) -> Self {
+        Self::from(_mm512_or_si512(self.0, b.0))
+    }
+
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn and(self, b: Self) -> Self {
+        Self::from(_mm512_and_si512(self.0, b.0))
+    }
+
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn xor(self, b: Self) -> Self {
+        Self::from(_mm512_xor_si512(self.0, b.0))
+    }
+
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn saturating_sub(self, b: Self) -> Self {
+        Self::from(_mm512_subs_epu8(self.0, b.0))
+    }
+
+    // ugly but shr<N> requires const generics
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn shr4(self) -> Self {
+        Self::from(_mm512_srli_epi16(self.0, 4)).and(Self::splat(0xFF >> 4))
+    }
+
+    // ugly but prev<N> requires const generics
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn prev1(self, prev: Self) -> Self {
+        const SHIFT: i32 = 16 - 1;
+        return Self::from(_mm512_alignr_epi8(
+            self.0,
+            _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0),
+            SHIFT,
+        ));
+    }
+    // ugly but prev<N> requires const generics
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn prev2(self, prev: Self) -> Self {
+        const SHIFT: i32 = 16 - 2;
+        return Self::from(_mm512_alignr_epi8(
+            self.0,
+            _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0),
+            SHIFT,
+        ));
+    }
+
+    // ugly but prev<N> requires const generics
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn prev3(self, prev: Self) -> Self {
+        const SHIFT: i32 = 16 - 3;
+        return Self::from(_mm512_alignr_epi8(
+            self.0,
+            _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0),
+            SHIFT,
+        ));
+    }
+
+    #[flexpect::e(clippy::cast_possible_wrap)]
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn signed_gt(self, other: Self) -> Self {
+        Self::from(_mm512_maskz_abs_epi8(
+            _mm512_cmpgt_epi8_mask(self.0, other.0),
+            _mm512_set1_epi8(0x80u8 as i8),
+        ))
+    }
+
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn any_bit_set(self) -> bool {
+        _mm512_test_epi8_mask(self.0, self.0) != 0
+    }
+
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn is_ascii(self) -> bool {
+        _mm512_movepi8_mask(self.0) == 0
+    }
+}
+
+impl From<__m512i> for SimdU8Value {
+    #[inline]
+    fn from(val: __m512i) -> Self {
+        Self(val)
+    }
+}
+
+impl Utf8CheckAlgorithm<SimdU8Value> {
+    #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+    #[inline]
+    unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value {
+        let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0b1110_0000 - 1));
+        let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0b1111_0000 - 1));
+
+        is_third_byte
+            .or(is_fourth_byte)
+            .signed_gt(SimdU8Value::splat0())
+    }
+}
+
+#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
+#[inline]
+unsafe fn simd_prefetch(ptr: *const u8) {
+    _mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0);
+}
+
+const PREFETCH: bool = true;
+use crate::implementation::helpers::TempSimdChunkA64 as TempSimdChunk;
+simd_input_512_bit!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]);
+algorithm_simd!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]);
diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs
@@ -1,6 +1,10 @@
 #[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))]
 pub(crate) mod avx2;
 
+// TODO: require actually necessary AVX-512 features
+#[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))]
+pub(crate) mod avx512;
+
 #[cfg(any(
     feature = "public_imp",
     all(feature = "std", not(target_feature = "avx2")),
@@ -28,7 +32,14 @@ pub(crate) unsafe fn validate_utf8_basic(
 
     #[inline]
     fn get_fastest_available_implementation_basic() -> ValidateUtf8Fn {
-        if std::is_x86_feature_detected!("avx2") {
+        // Test for avx512vbmi2 to make sure we have a newer CPU with a non-throttling AVX-512 implementation
+        if std::is_x86_feature_detected!("avx512f")
+            && std::is_x86_feature_detected!("avx512bw")
+            && std::is_x86_feature_detected!("avx512vbmi")
+            && std::is_x86_feature_detected!("avx512vbmi2")
+        {
+            avx512::validate_utf8_basic
+        } else if std::is_x86_feature_detected!("avx2") {
             avx2::validate_utf8_basic
         } else if std::is_x86_feature_detected!("sse4.2") {
             sse42::validate_utf8_basic
@@ -124,7 +135,13 @@ pub(crate) unsafe fn validate_utf8_compat(
 
     #[inline]
     fn get_fastest_available_implementation_compat() -> ValidateUtf8CompatFn {
-        if std::is_x86_feature_detected!("avx2") {
+        if std::is_x86_feature_detected!("avx512f")
+            && std::is_x86_feature_detected!("avx512bw")
+            && std::is_x86_feature_detected!("avx512vbmi")
+            && std::is_x86_feature_detected!("avx512vbmi2")
+        {
+            avx512::validate_utf8_compat
+        } else if std::is_x86_feature_detected!("avx2") {
             avx2::validate_utf8_compat
         } else if std::is_x86_feature_detected!("sse4.2") {
             sse42::validate_utf8_compat