Skip to content

Commit eb704c0

Browse files
committed
initial avx512 implementation
1 parent c046554 commit eb704c0

File tree

4 files changed

+334
-3
lines changed

4 files changed

+334
-3
lines changed

src/implementation/algorithm.rs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,10 @@ macro_rules! algorithm_simd {
182182
unsafe fn check_block(&mut self, input: SimdInput) {
183183
// WORKAROUND
184184
// necessary because the for loop is not unrolled on ARM64
185-
if input.vals.len() == 2 {
185+
if input.vals.len() == 1 {
186+
self.check_bytes(*input.vals.as_ptr());
187+
self.incomplete = Self::is_incomplete(*input.vals.as_ptr());
188+
} else if input.vals.len() == 2 {
186189
self.check_bytes(*input.vals.as_ptr());
187190
self.check_bytes(*input.vals.as_ptr().add(1));
188191
self.incomplete = Self::is_incomplete(*input.vals.as_ptr().add(1));
@@ -573,3 +576,30 @@ macro_rules! simd_input_256_bit {
573576
}
574577
};
575578
}
579+
580+
macro_rules! simd_input_512_bit {
581+
($(#[$feat:meta])*) => {
582+
#[repr(C)]
583+
struct SimdInput {
584+
vals: [SimdU8Value; 1],
585+
}
586+
587+
impl SimdInput {
588+
$(#[$feat])*
589+
#[inline]
590+
unsafe fn new(ptr: *const u8) -> Self {
591+
Self {
592+
vals: [
593+
SimdU8Value::load_from(ptr),
594+
],
595+
}
596+
}
597+
598+
$(#[$feat])*
599+
#[inline]
600+
unsafe fn is_ascii(&self) -> bool {
601+
self.vals[0].is_ascii()
602+
}
603+
}
604+
};
605+
}

src/implementation/helpers.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,10 @@ impl TempSimdChunkA16 {
139139
#[allow(dead_code)] // only used if a 256-bit SIMD implementation is used
140140
pub(crate) struct TempSimdChunkA32(pub(crate) [u8; SIMD_CHUNK_SIZE]);
141141

142+
#[repr(C, align(64))]
143+
#[allow(dead_code)] // only used if a 256-bit SIMD implementation is used
144+
pub(crate) struct TempSimdChunkA64(pub(crate) [u8; SIMD_CHUNK_SIZE]);
145+
142146
#[allow(dead_code)] // only used if there is a SIMD implementation
143147
impl TempSimdChunkA32 {
144148
#[flexpect::e(clippy::inline_always)]
@@ -148,6 +152,15 @@ impl TempSimdChunkA32 {
148152
}
149153
}
150154

155+
#[allow(dead_code)] // only used if there is a SIMD implementation
156+
impl TempSimdChunkA64 {
157+
#[flexpect::e(clippy::inline_always)]
158+
#[inline(always)] // needs to be forced because otherwise it is not inlined on armv7 neo
159+
pub(crate) const fn new() -> Self {
160+
Self([0; SIMD_CHUNK_SIZE])
161+
}
162+
}
163+
151164
#[derive(Clone, Copy)]
152165
#[allow(dead_code)] // only used if there is a SIMD implementation
153166
pub(crate) struct SimdU8Value<T>(pub(crate) T)

src/implementation/x86/avx512.rs

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
//! Contains the x86-64 AVX512 UTF-8 validation implementation.
2+
3+
use core::arch::x86_64::{
4+
__m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_cmpgt_epi8_mask, _mm512_loadu_si512,
5+
_mm512_maskz_abs_epi8, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8,
6+
_mm512_set_epi64, _mm512_setzero_si512, _mm512_shuffle_epi8, _mm512_srli_epi16,
7+
_mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, _MM_HINT_T0,
8+
};
9+
use core::arch::x86_64::{_mm512_movepi8_mask, _mm512_set_epi8};
10+
11+
use crate::implementation::helpers::Utf8CheckAlgorithm;
12+
13+
// AVX 2 SIMD primitives
14+
15+
type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m512i>;
16+
17+
impl SimdU8Value {
18+
#[flexpect::e(clippy::cast_possible_wrap)]
19+
#[flexpect::e(clippy::too_many_arguments)]
20+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
21+
#[inline]
22+
unsafe fn from_32_cut_off_leading(
23+
v0: u8,
24+
v1: u8,
25+
v2: u8,
26+
v3: u8,
27+
v4: u8,
28+
v5: u8,
29+
v6: u8,
30+
v7: u8,
31+
v8: u8,
32+
v9: u8,
33+
v10: u8,
34+
v11: u8,
35+
v12: u8,
36+
v13: u8,
37+
v14: u8,
38+
v15: u8,
39+
v16: u8,
40+
v17: u8,
41+
v18: u8,
42+
v19: u8,
43+
v20: u8,
44+
v21: u8,
45+
v22: u8,
46+
v23: u8,
47+
v24: u8,
48+
v25: u8,
49+
v26: u8,
50+
v27: u8,
51+
v28: u8,
52+
v29: u8,
53+
v30: u8,
54+
v31: u8,
55+
) -> Self {
56+
Self::from(_mm512_set_epi8(
57+
v31 as i8, v30 as i8, v29 as i8, v28 as i8, v27 as i8, v26 as i8, v25 as i8, v24 as i8,
58+
v23 as i8, v22 as i8, v21 as i8, v20 as i8, v19 as i8, v18 as i8, v17 as i8, v16 as i8,
59+
v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
60+
v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
61+
v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8,
62+
v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8,
63+
v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8,
64+
v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8,
65+
))
66+
}
67+
68+
#[flexpect::e(clippy::too_many_arguments)]
69+
#[flexpect::e(clippy::cast_possible_wrap)]
70+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
71+
#[inline]
72+
unsafe fn repeat_16(
73+
v0: u8,
74+
v1: u8,
75+
v2: u8,
76+
v3: u8,
77+
v4: u8,
78+
v5: u8,
79+
v6: u8,
80+
v7: u8,
81+
v8: u8,
82+
v9: u8,
83+
v10: u8,
84+
v11: u8,
85+
v12: u8,
86+
v13: u8,
87+
v14: u8,
88+
v15: u8,
89+
) -> Self {
90+
Self::from(_mm512_set_epi8(
91+
v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
92+
v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
93+
v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
94+
v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
95+
v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
96+
v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
97+
v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8,
98+
v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8,
99+
))
100+
}
101+
102+
#[flexpect::e(clippy::cast_ptr_alignment)]
103+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
104+
#[inline]
105+
unsafe fn load_from(ptr: *const u8) -> Self {
106+
Self::from(_mm512_loadu_si512(ptr.cast::<__m512i>()))
107+
}
108+
109+
#[flexpect::e(clippy::too_many_arguments)]
110+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
111+
#[inline]
112+
unsafe fn lookup_16(
113+
self,
114+
v0: u8,
115+
v1: u8,
116+
v2: u8,
117+
v3: u8,
118+
v4: u8,
119+
v5: u8,
120+
v6: u8,
121+
v7: u8,
122+
v8: u8,
123+
v9: u8,
124+
v10: u8,
125+
v11: u8,
126+
v12: u8,
127+
v13: u8,
128+
v14: u8,
129+
v15: u8,
130+
) -> Self {
131+
Self::from(_mm512_shuffle_epi8(
132+
Self::repeat_16(
133+
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
134+
)
135+
.0,
136+
self.0,
137+
))
138+
}
139+
140+
#[flexpect::e(clippy::cast_possible_wrap)]
141+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
142+
#[inline]
143+
unsafe fn splat(val: u8) -> Self {
144+
Self::from(_mm512_set1_epi8(val as i8))
145+
}
146+
147+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
148+
#[inline]
149+
unsafe fn splat0() -> Self {
150+
Self::from(_mm512_setzero_si512())
151+
}
152+
153+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
154+
#[inline]
155+
unsafe fn or(self, b: Self) -> Self {
156+
Self::from(_mm512_or_si512(self.0, b.0))
157+
}
158+
159+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
160+
#[inline]
161+
unsafe fn and(self, b: Self) -> Self {
162+
Self::from(_mm512_and_si512(self.0, b.0))
163+
}
164+
165+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
166+
#[inline]
167+
unsafe fn xor(self, b: Self) -> Self {
168+
Self::from(_mm512_xor_si512(self.0, b.0))
169+
}
170+
171+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
172+
#[inline]
173+
unsafe fn saturating_sub(self, b: Self) -> Self {
174+
Self::from(_mm512_subs_epu8(self.0, b.0))
175+
}
176+
177+
// ugly but shr<N> requires const generics
178+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
179+
#[inline]
180+
unsafe fn shr4(self) -> Self {
181+
Self::from(_mm512_srli_epi16(self.0, 4)).and(Self::splat(0xFF >> 4))
182+
}
183+
184+
// ugly but prev<N> requires const generics
185+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
186+
#[inline]
187+
unsafe fn prev1(self, prev: Self) -> Self {
188+
const SHIFT: i32 = 16 - 1;
189+
return Self::from(_mm512_alignr_epi8(
190+
self.0,
191+
_mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0),
192+
SHIFT,
193+
));
194+
}
195+
// ugly but prev<N> requires const generics
196+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
197+
#[inline]
198+
unsafe fn prev2(self, prev: Self) -> Self {
199+
const SHIFT: i32 = 16 - 2;
200+
return Self::from(_mm512_alignr_epi8(
201+
self.0,
202+
_mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0),
203+
SHIFT,
204+
));
205+
}
206+
207+
// ugly but prev<N> requires const generics
208+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
209+
#[inline]
210+
unsafe fn prev3(self, prev: Self) -> Self {
211+
const SHIFT: i32 = 16 - 3;
212+
return Self::from(_mm512_alignr_epi8(
213+
self.0,
214+
_mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0),
215+
SHIFT,
216+
));
217+
}
218+
219+
#[flexpect::e(clippy::cast_possible_wrap)]
220+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
221+
#[inline]
222+
unsafe fn signed_gt(self, other: Self) -> Self {
223+
Self::from(_mm512_maskz_abs_epi8(
224+
_mm512_cmpgt_epi8_mask(self.0, other.0),
225+
_mm512_set1_epi8(0x80u8 as i8),
226+
))
227+
}
228+
229+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
230+
#[inline]
231+
unsafe fn any_bit_set(self) -> bool {
232+
_mm512_test_epi8_mask(self.0, self.0) != 0
233+
}
234+
235+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
236+
#[inline]
237+
unsafe fn is_ascii(self) -> bool {
238+
_mm512_movepi8_mask(self.0) == 0
239+
}
240+
}
241+
242+
impl From<__m512i> for SimdU8Value {
243+
#[inline]
244+
fn from(val: __m512i) -> Self {
245+
Self(val)
246+
}
247+
}
248+
249+
impl Utf8CheckAlgorithm<SimdU8Value> {
250+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
251+
#[inline]
252+
unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value {
253+
let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0b1110_0000 - 1));
254+
let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0b1111_0000 - 1));
255+
256+
is_third_byte
257+
.or(is_fourth_byte)
258+
.signed_gt(SimdU8Value::splat0())
259+
}
260+
}
261+
262+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
263+
#[inline]
264+
unsafe fn simd_prefetch(ptr: *const u8) {
265+
_mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0);
266+
}
267+
268+
const PREFETCH: bool = true;
269+
use crate::implementation::helpers::TempSimdChunkA64 as TempSimdChunk;
270+
simd_input_512_bit!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]);
271+
algorithm_simd!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]);

src/implementation/x86/mod.rs

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
#[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))]
22
pub(crate) mod avx2;
33

4+
// TODO: require actually necessary AVX-512 features
5+
#[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))]
6+
pub(crate) mod avx512;
7+
48
#[cfg(any(
59
feature = "public_imp",
610
all(feature = "std", not(target_feature = "avx2")),
@@ -28,7 +32,14 @@ pub(crate) unsafe fn validate_utf8_basic(
2832

2933
#[inline]
3034
fn get_fastest_available_implementation_basic() -> ValidateUtf8Fn {
31-
if std::is_x86_feature_detected!("avx2") {
35+
// Test for avx512vbmi2 to make sure we have a newer CPU with a non-throttling AVX-512 implementation
36+
if std::is_x86_feature_detected!("avx512f")
37+
&& std::is_x86_feature_detected!("avx512bw")
38+
&& std::is_x86_feature_detected!("avx512vbmi")
39+
&& std::is_x86_feature_detected!("avx512vbmi2")
40+
{
41+
avx512::validate_utf8_basic
42+
} else if std::is_x86_feature_detected!("avx2") {
3243
avx2::validate_utf8_basic
3344
} else if std::is_x86_feature_detected!("sse4.2") {
3445
sse42::validate_utf8_basic
@@ -124,7 +135,13 @@ pub(crate) unsafe fn validate_utf8_compat(
124135

125136
#[inline]
126137
fn get_fastest_available_implementation_compat() -> ValidateUtf8CompatFn {
127-
if std::is_x86_feature_detected!("avx2") {
138+
if std::is_x86_feature_detected!("avx512f")
139+
&& std::is_x86_feature_detected!("avx512bw")
140+
&& std::is_x86_feature_detected!("avx512vbmi")
141+
&& std::is_x86_feature_detected!("avx512vbmi2")
142+
{
143+
avx512::validate_utf8_compat
144+
} else if std::is_x86_feature_detected!("avx2") {
128145
avx2::validate_utf8_compat
129146
} else if std::is_x86_feature_detected!("sse4.2") {
130147
sse42::validate_utf8_compat

0 commit comments

Comments
 (0)