Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix family of _mmX_alignr_epiX functions #1678

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 11 additions & 12 deletions crates/core_arch/src/x86/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

use core::hint::unreachable_unchecked;

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

Expand Down Expand Up @@ -164,7 +166,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
static_assert_uimm_bits!(IMM8, 8);
// If palignr is shifting the pair of vectors more than the size of two
// lanes, emit zero.
if IMM8 > 32 {
if IMM8 >= 32 {
return _mm256_setzero_si256();
}
// If palignr is shifting the pair of input vectors more than one lane,
Expand All @@ -178,6 +180,10 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
let a = a.as_i8x32();
let b = b.as_i8x32();

if IMM8 == 16 {
marxin marked this conversation as resolved.
Show resolved Hide resolved
return transmute(a);
}

let r: i8x32 = match IMM8 % 16 {
0 => simd_shuffle!(
b,
Expand Down Expand Up @@ -307,7 +313,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
],
),
_ => b,
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -5305,16 +5311,6 @@ mod tests {
);
assert_eq_m256i(r, expected);

#[rustfmt::skip]
let expected = _mm256_setr_epi8(
-1, -2, -3, -4, -5, -6, -7, -8,
-9, -10, -11, -12, -13, -14, -15, -16, -17,
-18, -19, -20, -21, -22, -23, -24, -25,
-26, -27, -28, -29, -30, -31, -32,
);
let r = _mm256_alignr_epi8::<16>(a, b);
assert_eq_m256i(r, expected);

let r = _mm256_alignr_epi8::<15>(a, b);
#[rustfmt::skip]
let expected = _mm256_setr_epi8(
Expand All @@ -5327,6 +5323,9 @@ mod tests {

let r = _mm256_alignr_epi8::<0>(a, b);
assert_eq_m256i(r, b);

let r = _mm256_alignr_epi8::<16>(a, b);
assert_eq_m256i(r, a);
}

#[simd_test(enable = "avx2")]
Expand Down
12 changes: 10 additions & 2 deletions crates/core_arch/src/x86/avx512bw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ use crate::{
ptr,
};

use core::hint::unreachable_unchecked;

#[cfg(test)]
use stdarch_test::assert_instr;

Expand Down Expand Up @@ -10850,6 +10852,8 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
}

/// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst.
/// Unlike the [`_mm_alignr_epi8`] and [`_mm256_alignr_epi8`] functions, where the entire input vectors are concatenated into the temporary result,
/// this concatenation happens in 4 independent steps, each of which builds a 32-byte temporary result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi8&expand=263)
#[inline]
Expand All @@ -10860,7 +10864,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
// If palignr is shifting the pair of vectors more than the size of two
// lanes, emit zero.
if IMM8 > 32 {
if IMM8 >= 32 {
return _mm512_setzero_si512();
}
// If palignr is shifting the pair of input vectors more than one lane,
Expand All @@ -10873,6 +10877,10 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
let a = a.as_i8x64();
let b = b.as_i8x64();

if IMM8 == 16 {
return transmute(a);
}

let r: i8x64 = match IMM8 % 16 {
0 => simd_shuffle!(
b,
Expand Down Expand Up @@ -11031,7 +11039,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
121, 122, 123, 124, 125, 126,
],
),
_ => b,
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down
49 changes: 25 additions & 24 deletions crates/core_arch/src/x86/avx512f.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::{
mem, ptr,
};

use core::hint::unreachable_unchecked;
#[cfg(test)]
use stdarch_test::assert_instr;

Expand Down Expand Up @@ -26202,6 +26203,8 @@ pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d

/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>4 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 60 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi32&expand=245)
#[inline]
#[target_feature(enable = "avx512f")]
Expand Down Expand Up @@ -26269,7 +26272,8 @@ pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __
12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26313,6 +26317,8 @@ pub unsafe fn _mm512_maskz_alignr_epi32<const IMM8: i32>(

/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>3 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 28 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi32&expand=242)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
Expand All @@ -26323,7 +26329,7 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x8();
let b = b.as_i32x8();
let imm8: i32 = IMM8 % 16;
let imm8: i32 = IMM8 % 8;
let r: i32x8 = match imm8 {
0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
Expand All @@ -26333,14 +26339,7 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
9 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
10 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
11 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
12 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
13 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
14 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
_ => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26384,6 +26383,8 @@ pub unsafe fn _mm256_maskz_alignr_epi32<const IMM8: i32>(

/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>2 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 12 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi32&expand=239)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
Expand All @@ -26394,16 +26395,13 @@ pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
static_assert_uimm_bits!(IMM8, 8);
let a = a.as_i32x4();
let b = b.as_i32x4();
let imm8: i32 = IMM8 % 8;
let imm8: i32 = IMM8 % 4;
let r: i32x4 = match imm8 {
0 => simd_shuffle!(a, b, [4, 5, 6, 7]),
1 => simd_shuffle!(a, b, [5, 6, 7, 0]),
2 => simd_shuffle!(a, b, [6, 7, 0, 1]),
3 => simd_shuffle!(a, b, [7, 0, 1, 2]),
4 => simd_shuffle!(a, b, [0, 1, 2, 3]),
5 => simd_shuffle!(a, b, [1, 2, 3, 0]),
6 => simd_shuffle!(a, b, [2, 3, 0, 1]),
_ => simd_shuffle!(a, b, [3, 0, 1, 2]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26447,6 +26445,8 @@ pub unsafe fn _mm_maskz_alignr_epi32<const IMM8: i32>(

/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>3 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 56 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi64&expand=254)
#[inline]
#[target_feature(enable = "avx512f")]
Expand All @@ -26464,7 +26464,8 @@ pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __
4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
_ => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26508,6 +26509,8 @@ pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(

/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>2 bits</strong> of the immediate (<code>IMM8</code>) are used (shift by at most 24 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi64&expand=251)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
Expand All @@ -26516,16 +26519,13 @@ pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
static_assert_uimm_bits!(IMM8, 8);
let imm8: i32 = IMM8 % 8;
let imm8: i32 = IMM8 % 4;
let r: i64x4 = match imm8 {
0 => simd_shuffle!(a, b, [4, 5, 6, 7]),
1 => simd_shuffle!(a, b, [5, 6, 7, 0]),
2 => simd_shuffle!(a, b, [6, 7, 0, 1]),
3 => simd_shuffle!(a, b, [7, 0, 1, 2]),
4 => simd_shuffle!(a, b, [0, 1, 2, 3]),
5 => simd_shuffle!(a, b, [1, 2, 3, 4]),
6 => simd_shuffle!(a, b, [2, 3, 4, 5]),
_ => simd_shuffle!(a, b, [3, 4, 5, 6]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down Expand Up @@ -26569,6 +26569,8 @@ pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(

/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst.
///
/// <div class="warning">Only the lowest <strong>bit</strong> of the immediate (<code>IMM8</code>) is used (shift by at most 8 bytes)!</div>
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi64&expand=248)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
Expand All @@ -26577,12 +26579,11 @@ pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
static_assert_uimm_bits!(IMM8, 8);
let imm8: i32 = IMM8 % 4;
let imm8: i32 = IMM8 % 2;
let r: i64x2 = match imm8 {
0 => simd_shuffle!(a, b, [2, 3]),
1 => simd_shuffle!(a, b, [3, 0]),
2 => simd_shuffle!(a, b, [0, 1]),
_ => simd_shuffle!(a, b, [1, 2]),
_ => unreachable_unchecked(),
};
transmute(r)
}
Expand Down
5 changes: 4 additions & 1 deletion crates/core_arch/src/x86_64/avx512f.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11189,8 +11189,11 @@ mod tests {
let r = _mm256_alignr_epi64::<0>(a, b);
let e = _mm256_set_epi64x(8, 7, 6, 5);
assert_eq_m256i(r, e);
let r = _mm256_alignr_epi64::<1>(a, b);
let e = _mm256_set_epi64x(1, 8, 7, 6);
assert_eq_m256i(r, e);
let r = _mm256_alignr_epi64::<6>(a, b);
let e = _mm256_set_epi64x(6, 5, 4, 3);
let e = _mm256_set_epi64x(2, 1, 8, 7);
assert_eq_m256i(r, e);
}

Expand Down