diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 853a47e9..384c0c63 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -178,6 +178,10 @@ impl Simd for Avx2 { unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) } } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + unsafe { _mm256_cvtps_pd(a.into()).simd_into(self) } + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { f64x2 { val: bytemuck::cast(a.val), @@ -1449,6 +1453,15 @@ impl Simd for Avx2 { } } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + unsafe { + let (a0, a1) = self.split_f32x8(a); + let high = _mm256_cvtps_pd(a0.into()).simd_into(self); + let low = _mm256_cvtps_pd(a1.into()).simd_into(self); + self.combine_f64x4(high, low) + } + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { f64x4 { val: bytemuck::cast(a.val), @@ -2818,6 +2831,10 @@ impl Simd for Avx2 { } } #[inline(always)] + fn narrow_f64x4(self, a: f64x4) -> f32x4 { + unsafe { _mm256_cvtpd_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { f32x8 { val: bytemuck::cast(a.val), @@ -3052,6 +3069,18 @@ impl Simd for Avx2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + crate::Fallback::new() + .load_interleaved_128_f32x16(src) + .val + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let fb = crate::Fallback::new(); + fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); + } + #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( @@ -3068,18 +3097,6 @@ impl Simd for Avx2 { ) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - crate::Fallback::new() - .load_interleaved_128_f32x16(src) - .val - .simd_into(self) - } - #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let fb = crate::Fallback::new(); - fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); - } - #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ -4484,6 +4501,15 @@ impl Simd for Avx2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a, b) = self.split_f64x8(a); + unsafe { + let lo = _mm256_cvtpd_ps(a.into()); + let hi = _mm256_cvtpd_ps(b.into()); + _mm256_setr_m128(lo, hi).simd_into(self) + } + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index 8e7d1db5..75de5b59 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -338,6 +338,16 @@ impl Simd for Fallback { result.simd_into(self) } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + [ + a[0usize] as f64, + a[1usize] as f64, + a[2usize] as f64, + a[3usize] as f64, + ] + .simd_into(self) + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { f64x2 { val: bytemuck::cast(a.val), @@ -3251,6 +3261,11 @@ impl Simd for Fallback { 
(b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1)) + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( @@ -4684,6 +4699,16 @@ impl Simd for Fallback { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x4(self, a: f64x4) -> f32x4 { + [ + a[0usize] as f32, + a[1usize] as f32, + a[2usize] as f32, + a[3usize] as f32, + ] + .simd_into(self) + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( @@ -4934,22 +4959,6 @@ impl Simd for Fallback { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f64x4( - self.reinterpret_f64_f32x8(a0), - self.reinterpret_f64_f32x8(a1), - ) - } - #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.reinterpret_i32_f32x8(a0), - self.reinterpret_i32_f32x8(a1), - ) - } - #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { [ src[0usize], @@ -4980,6 +4989,22 @@ impl Simd for Fallback { ]; } #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) + } + #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ -6489,6 +6514,11 @@ impl Simd for Fallback { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1)) + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index a533ce76..2f03bb9b 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -184,6 +184,14 @@ impl Simd for Neon { result.simd_into(self) } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + unsafe { + let low = vcvt_f64_f32(vget_low_f32(a.into())); + let high = vcvt_high_f64_f32(a.into()); + float64x2x2_t(low, high).simd_into(self) + } + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { unsafe { vreinterpretq_f64_f32(a.into()).simd_into(self) } } @@ -1401,6 +1409,11 @@ impl Simd for Neon { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1)) + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( @@ -2821,6 +2834,13 @@ impl Simd for Neon { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x4(self, a: 
f64x4) -> f32x4 { + unsafe { + let converted: float64x2x2_t = a.into(); + vcvt_high_f32_f64(vcvt_f32_f64(converted.0), converted.1).simd_into(self) + } + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( @@ -3071,6 +3091,14 @@ impl Simd for Neon { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + unsafe { vld4q_f32(src.as_ptr()).simd_into(self) } + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) } + } + #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( @@ -3087,14 +3115,6 @@ impl Simd for Neon { ) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - unsafe { vld4q_f32(src.as_ptr()).simd_into(self) } - } - #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) } - } - #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ -4465,6 +4485,11 @@ impl Simd for Neon { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1)) + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index e762c279..b10ea490 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -96,6 +96,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn trunc_f32x4(self, a: f32x4) -> f32x4; fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4; fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8; + fn widen_f32x4(self, a: f32x4) -> f64x4; fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2; fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4; fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16; @@ -374,6 +375,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8; fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16; fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4); + fn widen_f32x8(self, a: f32x8) -> f64x8; fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4; fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8; fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32; @@ -619,6 +621,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4; fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8; fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2); + fn narrow_f64x4(self, a: f64x4) -> f32x4; fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8; fn splat_mask64x4(self, val: i64) -> mask64x4; fn not_mask64x4(self, a: mask64x4) -> mask64x4; @@ -663,10 +666,10 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn trunc_f32x16(self, a: f32x16) -> f32x16; fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16; fn split_f32x16(self, a: 
f32x16) -> (f32x8, f32x8); - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8; - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16; fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16; fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> (); + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8; + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16; fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64; fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16; fn cvt_u32_f32x16(self, a: f32x16) -> u32x16; @@ -905,6 +908,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn trunc_f64x8(self, a: f64x8) -> f64x8; fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8; fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4); + fn narrow_f64x8(self, a: f64x8) -> f32x8; fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16; fn splat_mask64x8(self, val: i64) -> mask64x8; fn not_mask64x8(self, a: mask64x8) -> mask64x8; diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index b6d53c39..9a48e569 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -186,6 +186,16 @@ impl Simd for Sse4_2 { result.simd_into(self) } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + unsafe { + let raw = a.into(); + let high = _mm_cvtps_pd(raw).simd_into(self); + let low = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128::<8>(_mm_castps_si128(raw)))) + .simd_into(self); + self.combine_f64x2(high, low) + } + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { f64x2 { val: bytemuck::cast(a.val), @@ -1527,6 +1537,11 @@ impl Simd for Sse4_2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1)) + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( @@ -2949,6 +2964,15 @@ impl Simd for Sse4_2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x4(self, a: f64x4) -> f32x4 { + let (a, b) = self.split_f64x4(a); + unsafe { + let lo = _mm_cvtpd_ps(a.into()); + let hi = _mm_cvtpd_ps(b.into()); + _mm_movelh_ps(lo, hi).simd_into(self) + } + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( @@ -3199,6 +3223,18 @@ impl Simd for Sse4_2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + crate::Fallback::new() + .load_interleaved_128_f32x16(src) + .val + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let fb = crate::Fallback::new(); + fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); + } + #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( @@ -3215,18 +3251,6 @@ impl Simd for Sse4_2 { ) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - crate::Fallback::new() - .load_interleaved_128_f32x16(src) - .val - .simd_into(self) - } - #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let fb = crate::Fallback::new(); - fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); - } - 
#[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ -4623,6 +4647,11 @@ impl Simd for Sse4_2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1)) + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 1383f7ca..f6f9623f 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -169,6 +169,12 @@ impl Simd for WasmSimd128 { result.simd_into(self) } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + let low = f64x2_promote_low_f32x4(a.into()); + let high = f64x2_promote_low_f32x4(u64x2_shuffle::<1, 1>(a.into(), a.into())); + self.combine_f64x2(low.simd_into(self), high.simd_into(self)) + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { ::from(a).simd_into(self) } @@ -1356,6 +1362,11 @@ impl Simd for WasmSimd128 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1)) + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( @@ -2776,6 +2787,14 @@ impl Simd for WasmSimd128 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x4(self, a: f64x4) -> f32x4 { + let (low, high) = self.split_f64x4(a); + let low = f32x4_demote_f64x2_zero(low.into()); + let high = f32x4_demote_f64x2_zero(high.into()); + let result = u64x2_shuffle::<0, 2>(low, high); + result.simd_into(self) + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( @@ -3026,22 +3045,6 @@ impl Simd for WasmSimd128 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f64x4( - self.reinterpret_f64_f32x8(a0), - self.reinterpret_f64_f32x8(a1), - ) - } - #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.reinterpret_i32_f32x8(a0), - self.reinterpret_i32_f32x8(a1), - ) - } - #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { let v0: v128 = unsafe { v128_load(src[0 * 4usize..].as_ptr() as *const v128) }; let v1: v128 = unsafe { v128_load(src[1 * 4usize..].as_ptr() as *const v128) }; @@ -3084,6 +3087,22 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) + } + #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ 
-4580,6 +4599,11 @@ impl Simd for WasmSimd128 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1)) + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd_gen/src/mk_avx2.rs b/fearless_simd_gen/src/mk_avx2.rs index 5e565d4d..a11965cf 100644 --- a/fearless_simd_gen/src/mk_avx2.rs +++ b/fearless_simd_gen/src/mk_avx2.rs @@ -352,8 +352,8 @@ pub(crate) fn handle_widen_narrow( } "narrow" => { let dst_width = t.n_bits(); - match (dst_width, vec_ty.n_bits()) { - (128, 256) => { + match (vec_ty.scalar, dst_width, vec_ty.n_bits()) { + (ScalarType::Unsigned, 128, 256) => { let mask = match t.scalar_bits { 8 => { quote! { 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1 } } @@ -371,7 +371,14 @@ pub(crate) fn handle_widen_narrow( } } } - (256, 512) => { + (ScalarType::Float, 128, 256) => { + quote! { + unsafe { + _mm256_cvtpd_ps(a.into()).simd_into(self) + } + } + } + (ScalarType::Unsigned, 256, 512) => { let mask = set1_intrinsic(&VecType::new( vec_ty.scalar, vec_ty.scalar_bits, @@ -398,6 +405,17 @@ pub(crate) fn handle_widen_narrow( } } } + (ScalarType::Float, 256, 512) => { + let split = format_ident!("split_{}", vec_ty.rust_name()); + quote! { + let (a, b) = self.#split(a); + unsafe { + let lo = _mm256_cvtpd_ps(a.into()); + let hi = _mm256_cvtpd_ps(b.into()); + _mm256_setr_m128(lo, hi).simd_into(self) + } + } + } _ => unimplemented!(), } } diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 693ae9fc..0551781f 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -185,49 +185,125 @@ fn mk_simd_impl(level: Level) -> TokenStream { } } OpSig::WidenNarrow(target_ty) => { + assert_eq!( + vec_ty.scalar, target_ty.scalar, + "widen/narrow ops are done between the same scalar type" + ); + assert_eq!( + vec_ty.len, target_ty.len, + "widen/narrow ops preserve the vector length" + ); + let vec_scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits); let target_scalar_ty = target_ty.scalar.rust(target_ty.scalar_bits); - if method == "narrow" { - let arch = neon::arch_ty(vec_ty); - - let id1 = - Ident::new(&format!("vmovn_{}", vec_scalar_ty), Span::call_site()); - let id2 = Ident::new( - &format!("vcombine_{}", target_scalar_ty), - Span::call_site(), - ); - - quote! { - #method_sig { - unsafe { - let converted: #arch = a.into(); - let low = #id1(converted.0); - let high = #id1(converted.1); + match method { + "widen" => { + assert_eq!( + vec_ty.scalar_bits * 2, + target_ty.scalar_bits, + "the destination is twice as wide as the source" + ); - #id2(low, high).simd_into(self) + let arch = neon::arch_ty(&target_ty); + + if vec_ty.scalar == ScalarType::Float { + assert_eq!( + vec_ty.scalar_bits, 32, + "only widening from f32x4 to f64x4 is supported" + ); + assert_eq!( + vec_ty.len, 4, + "only widening from f32x4 to f64x4 is supported" + ); + + quote! 
{ + #method_sig { + unsafe { + let low = vcvt_f64_f32(vget_low_f32(a.into())); + let high = vcvt_high_f64_f32(a.into()); + + #arch(low, high).simd_into(self) + } + } + } + } else { + let id1 = Ident::new( + &format!("vmovl_{}", vec_scalar_ty), + Span::call_site(), + ); + let id2 = Ident::new( + &format!("vget_low_{}", vec_scalar_ty), + Span::call_site(), + ); + let id3 = Ident::new( + &format!("vget_high_{}", vec_scalar_ty), + Span::call_site(), + ); + + quote! { + #method_sig { + unsafe { + let low = #id1(#id2(a.into())); + let high = #id1(#id3(a.into())); + + #arch(low, high).simd_into(self) + } + } } } } - } else { - let arch = neon::arch_ty(&target_ty); - let id1 = - Ident::new(&format!("vmovl_{}", vec_scalar_ty), Span::call_site()); - let id2 = - Ident::new(&format!("vget_low_{}", vec_scalar_ty), Span::call_site()); - let id3 = - Ident::new(&format!("vget_high_{}", vec_scalar_ty), Span::call_site()); - - quote! { - #method_sig { - unsafe { - let low = #id1(#id2(a.into())); - let high = #id1(#id3(a.into())); + "narrow" => { + assert_eq!( + vec_ty.scalar_bits / 2, + target_ty.scalar_bits, + "the destination is half as wide as the source" + ); - #arch(low, high).simd_into(self) + let arch = neon::arch_ty(vec_ty); + + if vec_ty.scalar == ScalarType::Float { + assert_eq!( + vec_ty.scalar_bits, 64, + "only narrowing from f64x4 to f32x4 is supported" + ); + assert_eq!( + vec_ty.len, 4, + "only narrowing from f64x4 to f32x4 is supported" + ); + + quote! { + #method_sig { + unsafe { + let converted: #arch = a.into(); + vcvt_high_f32_f64(vcvt_f32_f64(converted.0), converted.1).simd_into(self) + } + } + } + } else { + let id1 = Ident::new( + &format!("vmovn_{}", vec_scalar_ty), + Span::call_site(), + ); + let id2 = Ident::new( + &format!("vcombine_{}", target_scalar_ty), + Span::call_site(), + ); + + quote! { + #method_sig { + unsafe { + let converted: #arch = a.into(); + let low = #id1(converted.0); + let high = #id1(converted.1); + + #id2(low, high).simd_into(self) + } + } } } } + _ => unreachable!(), } } OpSig::Binary => { diff --git a/fearless_simd_gen/src/mk_sse4_2.rs b/fearless_simd_gen/src/mk_sse4_2.rs index e0bcd653..f3ad873b 100644 --- a/fearless_simd_gen/src/mk_sse4_2.rs +++ b/fearless_simd_gen/src/mk_sse4_2.rs @@ -348,6 +348,23 @@ pub(crate) fn handle_widen_narrow( } .rust_name() ); + let shifted = if vec_ty.scalar == ScalarType::Float { + let to_ints = cast_ident( + ScalarType::Float, + ScalarType::Int, + vec_ty.scalar_bits, + vec_ty.n_bits(), + ); + let from_ints = cast_ident( + ScalarType::Int, + ScalarType::Float, + vec_ty.scalar_bits, + vec_ty.n_bits(), + ); + quote! { #from_ints(_mm_srli_si128::<8>(#to_ints(raw))) } + } else { + quote! { _mm_srli_si128::<8>(raw) } + }; quote! { #method_sig { unsafe { @@ -355,37 +372,55 @@ pub(crate) fn handle_widen_narrow( let high = #extend(raw).simd_into(self); // Shift by 8 since we want to get the higher part into the // lower position. - let low = #extend(_mm_srli_si128::<8>(raw)).simd_into(self); + let low = #extend(#shifted).simd_into(self); self.#combine(high, low) } } } } "narrow" => { - let mask = set1_intrinsic(&VecType::new( - vec_ty.scalar, - vec_ty.scalar_bits, - vec_ty.len / 2, - )); - let pack = pack_intrinsic( - vec_ty.scalar_bits, - matches!(vec_ty.scalar, ScalarType::Int), - t.n_bits(), - ); - let split = format_ident!("split_{}", vec_ty.rust_name()); - quote! { - #method_sig { - let (a, b) = self.#split(a); - unsafe { - // Note that SSE4.2 only has an intrinsic for saturating cast, - // but not wrapping. 
- let mask = #mask(0xFF); - let lo_masked = _mm_and_si128(a.into(), mask); - let hi_masked = _mm_and_si128(b.into(), mask); - let result = #pack(lo_masked, hi_masked); - result.simd_into(self) + match (vec_ty.scalar, t.n_bits(), vec_ty.n_bits()) { + (ScalarType::Unsigned, 128, 256) => { + let mask = set1_intrinsic(&VecType::new( + vec_ty.scalar, + vec_ty.scalar_bits, + vec_ty.len / 2, + )); + let pack = pack_intrinsic( + vec_ty.scalar_bits, + matches!(vec_ty.scalar, ScalarType::Int), + t.n_bits(), + ); + let split = format_ident!("split_{}", vec_ty.rust_name()); + quote! { + #method_sig { + let (a, b) = self.#split(a); + unsafe { + // Note that SSE4.2 only has an intrinsic for saturating cast, + // but not wrapping. + let mask = #mask(0xFF); + let lo_masked = _mm_and_si128(a.into(), mask); + let hi_masked = _mm_and_si128(b.into(), mask); + let result = #pack(lo_masked, hi_masked); + result.simd_into(self) + } + } } } + (ScalarType::Float, 128, 256) => { + let split = format_ident!("split_{}", vec_ty.rust_name()); + quote! { + #method_sig { + let (a, b) = self.#split(a); + unsafe { + let lo = _mm_cvtpd_ps(a.into()); + let hi = _mm_cvtpd_ps(b.into()); + _mm_movelh_ps(lo, hi).simd_into(self) + } + } + } + } + _ => unimplemented!(), } } _ => unreachable!(), diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 56305658..6e9fc660 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -325,35 +325,67 @@ fn mk_simd_impl(level: Level) -> TokenStream { } } OpSig::WidenNarrow(to_ty) => { + assert_eq!(vec_ty.scalar, to_ty.scalar); + assert_eq!(vec_ty.len, to_ty.len); match method { "widen" => { - assert_eq!(vec_ty.rust_name(), "u8x16"); - assert_eq!(to_ty.rust_name(), "u16x16"); - quote! { - #method_sig { - let low = u16x8_extend_low_u8x16(a.into()); - let high = u16x8_extend_high_u8x16(a.into()); - self.combine_u16x8(low.simd_into(self), high.simd_into(self)) + assert_eq!(vec_ty.scalar_bits * 2, to_ty.scalar_bits); + + match (vec_ty.scalar, vec_ty.len, vec_ty.scalar_bits) { + (ScalarType::Unsigned, 16, 8) => { + quote! { + #method_sig { + let low = u16x8_extend_low_u8x16(a.into()); + let high = u16x8_extend_high_u8x16(a.into()); + self.combine_u16x8(low.simd_into(self), high.simd_into(self)) + } + } + } + (ScalarType::Float, 4, 32) => { + quote! { + #method_sig { + let low = f64x2_promote_low_f32x4(a.into()); + let high = f64x2_promote_low_f32x4(u64x2_shuffle::<1, 1>(a.into(), a.into())); + self.combine_f64x2(low.simd_into(self), high.simd_into(self)) + } + } } + _ => unimplemented!(), } } "narrow" => { - assert_eq!(vec_ty.rust_name(), "u16x16"); - assert_eq!(to_ty.rust_name(), "u8x16"); - // WASM SIMD only has saturating narrowing instructions, so we emulate - // truncated narrowing by masking out the - quote! { - #method_sig { - let mask = u16x8_splat(0xFF); - let (low, high) = self.split_u16x16(a); - let low_masked = v128_and(low.into(), mask); - let high_masked = v128_and(high.into(), mask); - let result = u8x16_narrow_i16x8(low_masked, high_masked); - result.simd_into(self) + assert_eq!(vec_ty.scalar_bits / 2, to_ty.scalar_bits); + + match (vec_ty.scalar, vec_ty.len, vec_ty.scalar_bits) { + (ScalarType::Unsigned, 16, 16) => { + // WASM SIMD only has saturating narrowing instructions, so we emulate + // truncated narrowing by masking out the upper byte of each u16 + quote! 
{ + #method_sig { + let mask = u16x8_splat(0xFF); + let (low, high) = self.split_u16x16(a); + let low_masked = v128_and(low.into(), mask); + let high_masked = v128_and(high.into(), mask); + let result = u8x16_narrow_i16x8(low_masked, high_masked); + result.simd_into(self) + } + } + } + (ScalarType::Float, 4, 64) => { + quote! { + #method_sig { + let (low, high) = self.split_f64x4(a); + let low = f32x4_demote_f64x2_zero(low.into()); + let high = f32x4_demote_f64x2_zero(high.into()); + let result = u64x2_shuffle::<0, 2>(low, high); + result.simd_into(self) + } + } } + _ => unimplemented!(), } } - _ => unimplemented!(), + _ => unreachable!(), } } OpSig::LoadInterleaved(block_size, count) => { diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index bf6b1631..d876ea2e 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -127,8 +127,23 @@ pub fn ops_for_type(ty: &VecType, cvt: bool) -> Vec<(&str, OpSig)> { ops.push(("neg", OpSig::Unary)); } - if ty.scalar == ScalarType::Float { - if cvt { + if let (ScalarType::Unsigned | ScalarType::Float, 8 | 16 | 32, 512) = + (ty.scalar, ty.scalar_bits, ty.n_bits()) + { + ops.push(("load_interleaved_128", OpSig::LoadInterleaved(128, 4))); + ops.push(("store_interleaved_128", OpSig::StoreInterleaved(128, 4))); + } + + if cvt { + if let Some(widened) = ty.widened() { + ops.push(("widen", OpSig::WidenNarrow(widened))); + } + + if let Some(narrowed) = ty.narrowed() { + ops.push(("narrow", OpSig::WidenNarrow(narrowed))); + } + + if ty.scalar == ScalarType::Float { if ty.scalar_bits == 64 { ops.push(("reinterpret_f32", OpSig::Reinterpret(ScalarType::Float, 32))); } else { @@ -138,44 +153,23 @@ pub fn ops_for_type(ty: &VecType, cvt: bool) -> Vec<(&str, OpSig)> { } } - if ty.scalar_bits == 64 { - return ops; - } - } - - if matches!(ty.scalar, ScalarType::Unsigned | ScalarType::Float) && ty.n_bits() == 512 { - ops.push(("load_interleaved_128", OpSig::LoadInterleaved(128, 4))); - } - - if matches!(ty.scalar, ScalarType::Unsigned | ScalarType::Float) && ty.n_bits() == 512 { - ops.push(("store_interleaved_128", OpSig::StoreInterleaved(128, 4))); - } - - if cvt { - if matches!(ty.scalar, ScalarType::Unsigned) { - if let Some(widened) = ty.widened() { - ops.push(("widen", OpSig::WidenNarrow(widened))); + // TODO: provide reinterpret ops for f64; it should be very straightforward + if !(ty.scalar == ScalarType::Float && ty.scalar_bits == 64) { + if valid_reinterpret(ty, ScalarType::Unsigned, 8) { + ops.push(( + "reinterpret_u8", + OpSig::Reinterpret(ScalarType::Unsigned, 8), + )); } - if let Some(narrowed) = ty.narrowed() { - ops.push(("narrow", OpSig::WidenNarrow(narrowed))); + if valid_reinterpret(ty, ScalarType::Unsigned, 32) { + ops.push(( + "reinterpret_u32", + OpSig::Reinterpret(ScalarType::Unsigned, 32), + )); } } - if valid_reinterpret(ty, ScalarType::Unsigned, 8) { - ops.push(( - "reinterpret_u8", - OpSig::Reinterpret(ScalarType::Unsigned, 8), - )); - } - - if valid_reinterpret(ty, ScalarType::Unsigned, 32) { - ops.push(( - "reinterpret_u32", - OpSig::Reinterpret(ScalarType::Unsigned, 32), - )); - } - match (ty.scalar, ty.scalar_bits) { (ScalarType::Float, 32) => { ops.push(("cvt_u32", OpSig::Cvt(ScalarType::Unsigned, 32))); diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs index ed0639a0..de544fa3 100644 --- a/fearless_simd_gen/src/types.rs +++ b/fearless_simd_gen/src/types.rs @@ -74,27 +74,19 @@ impl VecType { } pub fn widened(&self) -> Option { - if matches!(self.scalar, ScalarType::Mask 
| ScalarType::Float) - || self.n_bits() > 256 - || self.scalar_bits != 8 - { - return None; + match (self.scalar, self.n_bits(), self.scalar_bits) { + (ScalarType::Unsigned, 128 | 256, 8) => Some(Self::new(self.scalar, 16, self.len)), + (ScalarType::Float, 128 | 256, 32) => Some(Self::new(self.scalar, 64, self.len)), + _ => None, } - - let scalar_bits = self.scalar_bits * 2; - Some(Self::new(self.scalar, scalar_bits, self.len)) } pub fn narrowed(&self) -> Option { - if matches!(self.scalar, ScalarType::Mask | ScalarType::Float) - || self.n_bits() < 256 - || self.scalar_bits != 16 - { - return None; + match (self.scalar, self.n_bits(), self.scalar_bits) { + (ScalarType::Unsigned, 256 | 512, 16) => Some(Self::new(self.scalar, 8, self.len)), + (ScalarType::Float, 256 | 512, 64) => Some(Self::new(self.scalar, 32, self.len)), + _ => None, } - - let scalar_bits = self.scalar_bits / 2; - Some(Self::new(self.scalar, scalar_bits, self.len)) } pub fn mask_ty(&self) -> Self { diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index c7706348..3b899de3 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -9,6 +9,10 @@ clippy::unseparated_literal_suffix, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] +#![expect( + clippy::approx_constant, + reason = "these constants are good test vectors" +)] //! Tests for `fearless_simd`. @@ -2380,6 +2384,71 @@ fn narrow_u16x32(simd: S) { ); } +#[simd_test] +fn widen_f32x4(simd: S) { + let a = f32x4::from_slice(simd, &[1.0, -2.5, 3.14159, 0.0]); + assert_eq!(simd.widen_f32x4(a).val, [1.0, -2.5, 3.141590118408203, 0.0]); +} + +#[simd_test] +fn widen_f32x8(simd: S) { + let a = f32x8::from_slice( + simd, + &[1.0, -2.5, 3.14159, 0.0, 100.25, -0.5, f32::MAX, f32::MIN], + ); + let result = simd.widen_f32x8(a); + assert_eq!( + result.val, + [ + 1.0, + -2.5, + 3.141590118408203, + 0.0, + 100.25, + -0.5, + f32::MAX as f64, + f32::MIN as f64 + ] + ); +} + +#[simd_test] +fn narrow_f64x4(simd: S) { + let a = f64x4::from_slice(simd, &[1.0, -2.5, 3.14159265358979, 0.0]); + assert_eq!(simd.narrow_f64x4(a).val, [1.0, -2.5, 3.1415927, 0.0]); +} + +#[simd_test] +fn narrow_f64x8(simd: S) { + let a = f64x8::from_slice( + simd, + &[ + 1.0, + -2.5, + 3.14159265358979, + 0.0, + 100.25, + -0.5, + f64::MAX, + f64::MIN, + ], + ); + let result = simd.narrow_f64x8(a); + assert_eq!( + result.val, + [ + 1.0, + -2.5, + 3.1415927, + 0.0, + 100.25, + -0.5, + f32::INFINITY, + f32::NEG_INFINITY + ] + ); +} + #[simd_test] fn abs_f64x2(simd: S) { let a = f64x2::from_slice(simd, &[-1.5, 2.5]);
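The new widen/narrow methods also compose cleanly: widening to f64 and narrowing back is lossless for lane values that are exactly representable in f32. A round-trip check in the style of the harness tests above would make that explicit; this is only a sketch that reuses the `#[simd_test]` macro, the constructors, and the new trait methods shown in this diff (the test name is illustrative, not part of the change):

#[simd_test]
fn widen_narrow_roundtrip_f32x4(simd: S) {
    // All lanes are exactly representable in f32, so widening to f64x4 and
    // narrowing back to f32x4 must return the original values.
    let a = f32x4::from_slice(simd, &[1.0, -2.5, 0.25, 0.0]);
    let widened = simd.widen_f32x4(a);
    let narrowed = simd.narrow_f64x4(widened);
    assert_eq!(narrowed.val, [1.0, -2.5, 0.25, 0.0]);
}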