Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix family of _mmX_alignr_epiX functions #1678

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 11 additions & 12 deletions crates/core_arch/src/x86/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

use core::hint::unreachable_unchecked;

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

Expand Down Expand Up @@ -164,7 +166,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
static_assert_uimm_bits!(IMM8, 8);
// If palignr is shifting the pair of vectors more than the size of two
// lanes, emit zero.
if IMM8 > 32 {
if IMM8 >= 32 {
return _mm256_setzero_si256();
}
// If palignr is shifting the pair of input vectors more than one lane,
Expand All @@ -178,6 +180,10 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
let a = a.as_i8x32();
let b = b.as_i8x32();

if IMM8 == 16 {
marxin marked this conversation as resolved.
Show resolved Hide resolved
return transmute(a);
}

let r: i8x32 = match IMM8 % 16 {
0 => simd_shuffle!(
b,
Expand Down Expand Up @@ -307,7 +313,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
],
),
_ => b,
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -5305,16 +5311,6 @@ mod tests {
);
assert_eq_m256i(r, expected);

#[rustfmt::skip]
let expected = _mm256_setr_epi8(
-1, -2, -3, -4, -5, -6, -7, -8,
-9, -10, -11, -12, -13, -14, -15, -16, -17,
-18, -19, -20, -21, -22, -23, -24, -25,
-26, -27, -28, -29, -30, -31, -32,
);
let r = _mm256_alignr_epi8::<16>(a, b);
assert_eq_m256i(r, expected);

let r = _mm256_alignr_epi8::<15>(a, b);
#[rustfmt::skip]
let expected = _mm256_setr_epi8(
Expand All @@ -5327,6 +5323,9 @@ mod tests {

let r = _mm256_alignr_epi8::<0>(a, b);
assert_eq_m256i(r, b);

let r = _mm256_alignr_epi8::<16>(a, b);
assert_eq_m256i(r, a);
}

#[simd_test(enable = "avx2")]
Expand Down
12 changes: 10 additions & 2 deletions crates/core_arch/src/x86/avx512bw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ use crate::{
ptr,
};

use core::hint::unreachable_unchecked;

#[cfg(test)]
use stdarch_test::assert_instr;

Expand Down Expand Up @@ -10850,6 +10852,8 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
}

/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst.
/// Unlike the [`_mm_alignr_epi8`] and [`_mm256_alignr_epi8`] functions, where the entire input vectors are concatenated into the temporary result,
/// this concatenation happens in 4 independent steps, each of which builds a 32-byte temporary result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi8&expand=263)
#[inline]
Expand All @@ -10860,7 +10864,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
// If palignr is shifting the pair of vectors more than the size of two
// lanes, emit zero.
if IMM8 > 32 {
if IMM8 >= 32 {
return _mm512_setzero_si512();
}
// If palignr is shifting the pair of input vectors more than one lane,
Expand All @@ -10873,6 +10877,10 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
let a = a.as_i8x64();
let b = b.as_i8x64();

if IMM8 == 16 {
return transmute(a);
}

let r: i8x64 = match IMM8 % 16 {
0 => simd_shuffle!(
b,
Expand Down Expand Up @@ -11031,7 +11039,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
121, 122, 123, 124, 125, 126,
],
),
_ => b,
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down
49 changes: 25 additions & 24 deletions crates/core_arch/src/x86/avx512f.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::{
mem, ptr,
};

use core::hint::unreachable_unchecked;
#[cfg(test)]
use stdarch_test::assert_instr;

Expand Down Expand Up @@ -26202,6 +26203,8 @@ pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d

/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>4 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 60 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi32&expand=245)
#[inline]
#[target_feature(enable = "avx512f")]
Expand Down Expand Up @@ -26269,7 +26272,8 @@ pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __
12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26313,6 +26317,8 @@ pub unsafe fn _mm512_maskz_alignr_epi32<const IMM8: i32>(

/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>3 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 28 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi32&expand=242)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
Expand All @@ -26323,7 +26329,7 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let b = b.as_i32x8();
let imm8: i32 = IMM8 % 16;
let imm8: i32 = IMM8 % 8;
let r: i32x8 = match imm8 {
0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
Expand All @@ -26333,14 +26339,7 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
9 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
10 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26384,6 +26383,8 @@ pub unsafe fn _mm256_maskz_alignr_epi32<const IMM8: i32>(

/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>2 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 12 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi32&expand=239)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
Expand All @@ -26394,16 +26395,13 @@ pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let b = b.as_i32x4();
let imm8: i32 = IMM8 % 8;
let imm8: i32 = IMM8 % 4;
let r: i32x4 = match imm8 {
0 => simd_shuffle!(a, b, [4, 5, 6, 7]),
1 => simd_shuffle!(a, b, [5, 6, 7, 0]),
2 => simd_shuffle!(a, b, [6, 7, 0, 1]),
3 => simd_shuffle!(a, b, [7, 0, 1, 2]),
4 => simd_shuffle!(a, b, [0, 1, 2, 3]),
5 => simd_shuffle!(a, b, [1, 2, 3, 0]),
6 => simd_shuffle!(a, b, [2, 3, 0, 1]),
_ => simd_shuffle!(a, b, [3, 0, 1, 2]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26447,6 +26445,8 @@ pub unsafe fn _mm_maskz_alignr_epi32<const IMM8: i32>(

/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>3 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 56 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi64&expand=254)
#[inline]
#[target_feature(enable = "avx512f")]
Expand All @@ -26464,7 +26464,8 @@ pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __
4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
_ => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26508,6 +26509,8 @@ pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(

/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>2 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 24 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi64&expand=251)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
Expand All @@ -26516,16 +26519,13 @@ pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
static_assert_uimm_bits!(IMM8, 8);
let imm8: i32 = IMM8 % 8;
let imm8: i32 = IMM8 % 4;
let r: i64x4 = match imm8 {
0 => simd_shuffle!(a, b, [4, 5, 6, 7]),
1 => simd_shuffle!(a, b, [5, 6, 7, 0]),
2 => simd_shuffle!(a, b, [6, 7, 0, 1]),
3 => simd_shuffle!(a, b, [7, 0, 1, 2]),
4 => simd_shuffle!(a, b, [0, 1, 2, 3]),
5 => simd_shuffle!(a, b, [1, 2, 3, 4]),
6 => simd_shuffle!(a, b, [2, 3, 4, 5]),
_ => simd_shuffle!(a, b, [3, 4, 5, 6]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26569,6 +26569,8 @@ pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(

/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>bit</strong> of the immediate (<code>IMM8</code>) is used (shift by at most 8 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi64&expand=248)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
Expand All @@ -26577,12 +26579,11 @@ pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
static_assert_uimm_bits!(IMM8, 8);
let imm8: i32 = IMM8 % 4;
let imm8: i32 = IMM8 % 2;
let r: i64x2 = match imm8 {
0 => simd_shuffle!(a, b, [2, 3]),
1 => simd_shuffle!(a, b, [3, 0]),
2 => simd_shuffle!(a, b, [0, 1]),
_ => simd_shuffle!(a, b, [1, 2]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down
5 changes: 4 additions & 1 deletion crates/core_arch/src/x86_64/avx512f.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11189,8 +11189,11 @@ mod tests {
let r = _mm256_alignr_epi64::<0>(a, b);
let e = _mm256_set_epi64x(8, 7, 6, 5);
assert_eq_m256i(r, e);
let r = _mm256_alignr_epi64::<1>(a, b);
let e = _mm256_set_epi64x(1, 8, 7, 6);
assert_eq_m256i(r, e);
let r = _mm256_alignr_epi64::<6>(a, b);
let e = _mm256_set_epi64x(6, 5, 4, 3);
let e = _mm256_set_epi64x(2, 1, 8, 7);
assert_eq_m256i(r, e);
}

Expand Down