diff --git a/src/i16x8_.rs b/src/i16x8_.rs
index 8b01d0b5..f8a8a76c 100644
--- a/src/i16x8_.rs
+++ b/src/i16x8_.rs
@@ -654,32 +654,94 @@ impl i16x8 {
   #[inline]
   #[must_use]
   pub fn reduce_add(self) -> i16 {
-    let arr: [i16; 8] = cast(self);
+    pick! {
+      if #[cfg(target_feature="sse2")] {
+        // there is a horizontal add instruction on ssse3, but apparently it is very slow on some AMD CPUs
+        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
+        let sum64 = add_i16_m128i(self.sse, hi64);
+        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
+        let sum32 = add_i16_m128i(sum64, hi32);
+        let lo16 = shr_imm_u32_m128i::<16>(sum32);
+        let sum16 = add_i16_m128i(sum32, lo16);
+        extract_i16_as_i32_m128i::<0>(sum16) as i16
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
+        unsafe { vaddvq_s16(self.neon) }
+      } else {
+        let arr: [i16; 8] = cast(self);
 
-    (arr[0].wrapping_add(arr[1]).wrapping_add(arr[2].wrapping_add(arr[3])))
-      .wrapping_add(
-        arr[4].wrapping_add(arr[5]).wrapping_add(arr[6].wrapping_add(arr[7])),
-      )
+        // most boring implementation possible so optimizer doesn't overthink this
+        let mut r = arr[0];
+        r = r.wrapping_add(arr[1]);
+        r = r.wrapping_add(arr[2]);
+        r = r.wrapping_add(arr[3]);
+        r = r.wrapping_add(arr[4]);
+        r = r.wrapping_add(arr[5]);
+        r = r.wrapping_add(arr[6]);
+        r.wrapping_add(arr[7])
+      }
+    }
   }
 
   /// horizontal min of all the elements of the vector
   #[inline]
   #[must_use]
   pub fn reduce_min(self) -> i16 {
-    let arr: [i16; 8] = cast(self);
+    pick! {
+      if #[cfg(target_feature="sse2")] {
+        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
+        let sum64 = min_i16_m128i(self.sse, hi64);
+        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
+        let sum32 = min_i16_m128i(sum64, hi32);
+        let lo16 = shr_imm_u32_m128i::<16>(sum32);
+        let sum16 = min_i16_m128i(sum32, lo16);
+        extract_i16_as_i32_m128i::<0>(sum16) as i16
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
+        unsafe { vminvq_s16(self.neon) }
+      } else {
+        let arr: [i16; 8] = cast(self);
 
-    (arr[0].min(arr[1]).min(arr[2].min(arr[3])))
-      .min(arr[4].min(arr[5]).min(arr[6].min(arr[7])))
+        // most boring implementation possible so optimizer doesn't overthink this
+        let mut r = arr[0];
+        r = r.min(arr[1]);
+        r = r.min(arr[2]);
+        r = r.min(arr[3]);
+        r = r.min(arr[4]);
+        r = r.min(arr[5]);
+        r = r.min(arr[6]);
+        r.min(arr[7])
+      }
+    }
   }
 
   /// horizontal max of all the elements of the vector
   #[inline]
   #[must_use]
   pub fn reduce_max(self) -> i16 {
-    let arr: [i16; 8] = cast(self);
+    pick! {
+      if #[cfg(target_feature="sse2")] {
+        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
+        let sum64 = max_i16_m128i(self.sse, hi64);
+        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
+        let sum32 = max_i16_m128i(sum64, hi32);
+        let lo16 = shr_imm_u32_m128i::<16>(sum32);
+        let sum16 = max_i16_m128i(sum32, lo16);
+        extract_i16_as_i32_m128i::<0>(sum16) as i16
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
+        unsafe { vmaxvq_s16(self.neon) }
+      } else {
+        let arr: [i16; 8] = cast(self);
 
-    (arr[0].max(arr[1]).max(arr[2].max(arr[3])))
-      .max(arr[4].max(arr[5]).max(arr[6].max(arr[7])))
+        // most boring implementation possible so optimizer doesn't overthink this
+        let mut r = arr[0];
+        r = r.max(arr[1]);
+        r = r.max(arr[2]);
+        r = r.max(arr[3]);
+        r = r.max(arr[4]);
+        r = r.max(arr[5]);
+        r = r.max(arr[6]);
+        r.max(arr[7])
+      }
+    }
   }
 
   #[inline]
diff --git a/src/i32x8_.rs b/src/i32x8_.rs
index 64e888e6..e187429e 100644
--- a/src/i32x8_.rs
+++ b/src/i32x8_.rs
@@ -303,14 +303,41 @@ impl i32x8 {
         }
       } else {
         i32x8::new([
-          v.as_array_ref()[0] as i32,
-          v.as_array_ref()[1] as i32,
-          v.as_array_ref()[2] as i32,
-          v.as_array_ref()[3] as i32,
-          v.as_array_ref()[4] as i32,
-          v.as_array_ref()[5] as i32,
-          v.as_array_ref()[6] as i32,
-          v.as_array_ref()[7] as i32,
+          i32::from(v.as_array_ref()[0]),
+          i32::from(v.as_array_ref()[1]),
+          i32::from(v.as_array_ref()[2]),
+          i32::from(v.as_array_ref()[3]),
+          i32::from(v.as_array_ref()[4]),
+          i32::from(v.as_array_ref()[5]),
+          i32::from(v.as_array_ref()[6]),
+          i32::from(v.as_array_ref()[7]),
+        ])
+      }
+    }
+  }
+
+  /// widens and zero extends to i32x8
+  #[inline]
+  #[must_use]
+  pub fn from_u16x8(v: u16x8) -> Self {
+    pick! {
+      if #[cfg(target_feature="avx2")] {
+        i32x8 { avx2:convert_to_i32_m256i_from_u16_m128i(v.sse) }
+      } else if #[cfg(target_feature="sse2")] {
+        i32x8 {
+          a: i32x4 { sse: shr_imm_u32_m128i::<16>( unpack_low_i16_m128i(v.sse, v.sse)) },
+          b: i32x4 { sse: shr_imm_u32_m128i::<16>( unpack_high_i16_m128i(v.sse, v.sse)) },
+        }
+      } else {
+        i32x8::new([
+          i32::from(v.as_array_ref()[0]),
+          i32::from(v.as_array_ref()[1]),
+          i32::from(v.as_array_ref()[2]),
+          i32::from(v.as_array_ref()[3]),
+          i32::from(v.as_array_ref()[4]),
+          i32::from(v.as_array_ref()[5]),
+          i32::from(v.as_array_ref()[6]),
+          i32::from(v.as_array_ref()[7]),
         ])
       }
     }
diff --git a/tests/all_tests/t_i32x8.rs b/tests/all_tests/t_i32x8.rs
index d8bd1ace..3e44df04 100644
--- a/tests/all_tests/t_i32x8.rs
+++ b/tests/all_tests/t_i32x8.rs
@@ -231,6 +231,24 @@ fn impl_from_i16x8() {
   assert_eq!(actual, expected);
 }
 
+#[test]
+fn impl_from_u16x8() {
+  let a = u16x8::from([1, 2, 3, 4, 5, i16::MAX as u16, u16::MAX - 1, u16::MAX]);
+  let actual = i32x8::from_u16x8(a);
+  let expected = i32x8::from([
+    1,
+    2,
+    3,
+    4,
+    5,
+    i16::MAX as i32,
+    (u16::MAX - 1) as i32,
+    u16::MAX as i32,
+  ]);
+
+  assert_eq!(actual, expected);
+}
+
 #[test]
 fn test_i16x8_move_mask() {
   let a = i16x8::from([-1, 0, -2, -3, -1, 0, -2, -3]);
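Not part of the diff, for reviewers unfamiliar with the SSE2 idiom above: all three reductions use the same halving ladder. Each round shuffles the upper half of the still-active lanes down next to the lower half and combines element-wise, so 8 lanes collapse to 4, then 2, then 1, and lane 0 ends up holding the value that extract_i16_as_i32_m128i::<0> pulls out. A minimal scalar sketch of that ladder follows; reduce_ladder and combine are illustrative names, not code in this PR.

// Scalar model of the 3-round halving ladder used by the sse2 branches of
// reduce_add / reduce_min / reduce_max. `combine` stands in for
// add_i16_m128i / min_i16_m128i / max_i16_m128i.
fn reduce_ladder(lanes: [i16; 8], combine: fn(i16, i16) -> i16) -> i16 {
  let mut v = lanes;
  // round 1: fold the high 64-bit half (lanes 4..8) onto the low half
  for i in 0..4 {
    v[i] = combine(v[i], v[i + 4]);
  }
  // round 2: fold lanes 2..4 onto lanes 0..2
  for i in 0..2 {
    v[i] = combine(v[i], v[i + 2]);
  }
  // round 3: fold lane 1 onto lane 0; lane 0 now holds the full reduction
  combine(v[0], v[1])
}

fn main() {
  let x = [3i16, -1, 7, 2, 9, 0, -5, 4];
  assert_eq!(reduce_ladder(x, |a, b| a.wrapping_add(b)), 19);
  assert_eq!(reduce_ladder(x, |a, b| a.min(b)), -5);
  assert_eq!(reduce_ladder(x, |a, b| a.max(b)), 9);
}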
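A quick usage sketch of the surface this PR adds, assuming these are the wide crate's i16x8, u16x8, and i32x8 types re-exported at the crate root; the expected values are worked out by hand, mirroring the style of the test above.

use wide::{i16x8, i32x8, u16x8};

fn main() {
  // new horizontal reductions on i16x8
  let v = i16x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
  assert_eq!(v.reduce_add(), 36);
  assert_eq!(v.reduce_min(), 1);
  assert_eq!(v.reduce_max(), 8);

  // new widening zero-extension: every u16 lane becomes a non-negative i32 lane
  let w = u16x8::from([0, 1, 2, 3, 4, 5, 6, u16::MAX]);
  let expected = i32x8::from([0, 1, 2, 3, 4, 5, 6, u16::MAX as i32]);
  assert_eq!(i32x8::from_u16x8(w), expected);
}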