From 679c22b9bb45b5ac9d88b544ed5af6fa677b9ccd Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 8 Dec 2025 15:55:39 +0000 Subject: [PATCH 01/12] WIP --- Cargo.toml | 1 + src/kernels/dit.rs | 33 +++++++++++++-------------------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 99ae92f..d0bcf30 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true } bytemuck = { version = "1.23.2", optional = true } wide = "0.8.1" rayon = { version = "1.11.0", optional = true } +fearless_simd = { git = "https://github.com/linebender/fearless_simd.git", version = "0.3.0" } [features] default = [] diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 879b556..d582258 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -5,7 +5,8 @@ use core::f32; use num_traits::Float; -use wide::{f32x16, f32x4, f32x8, f64x4, f64x8}; +use fearless_simd::{Simd, SimdBase, SimdFrom}; +use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; use crate::kernels::common::fft_chunk_2; @@ -114,32 +115,24 @@ pub fn fft_dit_chunk_4_simd_f32(reals: &mut [f32], imags: &mut [f32]) { } /// DIT butterfly for chunk_size == 8 (f64) with SIMD -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { const DIST: usize = 4; const CHUNK_SIZE: usize = DIST << 1; - let two = f64x4::splat(2.0); - let sqrt2_2 = f64x4::new([ + let two = f64x4::splat(simd, 2.0); + let sqrt2_2 = f64x4::simd_from([ 1.0, // W_8^0 real std::f64::consts::FRAC_1_SQRT_2, // W_8^1 real (sqrt(2)/2) 0.0, // W_8^2 real -std::f64::consts::FRAC_1_SQRT_2, // W_8^3 real (-sqrt(2)/2) - ]); - let sqrt2_2_im = f64x4::new([ + ], simd); + let sqrt2_2_im = f64x4::simd_from([ 0.0, // W_8^0 imag -std::f64::consts::FRAC_1_SQRT_2, // W_8^1 imag (-sqrt(2)/2) -1.0, // W_8^2 imag -std::f64::consts::FRAC_1_SQRT_2, // W_8^3 imag (-sqrt(2)/2) - ]); + ], simd); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -147,10 +140,10 @@ pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); - let in0_re = f64x4::new(reals_s0[0..4].try_into().unwrap()); - let in1_re = f64x4::new(reals_s1[0..4].try_into().unwrap()); - let in0_im = f64x4::new(imags_s0[0..4].try_into().unwrap()); - let in1_im = f64x4::new(imags_s1[0..4].try_into().unwrap()); + let in0_re = f64x4::simd_from(reals_s0[0..4].try_into().unwrap(), simd); + let in1_re = f64x4::simd_from(reals_s1[0..4].try_into().unwrap(), simd); + let in0_im = f64x4::simd_from(imags_s0[0..4].try_into().unwrap(), simd); + let in1_im = f64x4::simd_from(imags_s1[0..4].try_into().unwrap(), simd); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im let out0_re = sqrt2_2_im.mul_neg_add(in1_im, sqrt2_2.mul_add(in1_re, in0_re)); From f7a11330e453a8e1b0cb957aed536cd796f69ed0 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 12 Dec 2025 12:31:40 +0000 Subject: [PATCH 02/12] Adapt conversion to slice for fearless_simd --- src/kernels/dit.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index a4a0460..a0c204b 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -5,7 +5,7 @@ use core::f32; use num_traits::Float; -use fearless_simd::{Simd, SimdBase, SimdFrom}; +use fearless_simd::{Simd, SimdBase, SimdFrom, SimdFloat}; use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; use crate::kernels::common::fft_chunk_2; @@ -154,10 +154,10 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_array()); - imags_s0.copy_from_slice(out0_im.as_array()); - reals_s1.copy_from_slice(out1_re.as_array()); - imags_s1.copy_from_slice(out1_im.as_array()); + reals_s0.copy_from_slice(out0_re.as_slice()); + imags_s0.copy_from_slice(out0_im.as_slice()); + reals_s1.copy_from_slice(out1_re.as_slice()); + imags_s1.copy_from_slice(out1_im.as_slice()); }); } From 28166a9159af3741cb9665cf4bf58edf5e399a89 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 12 Dec 2025 12:40:33 +0000 Subject: [PATCH 03/12] Fully convert fft_dit_chunk_8_simd_f64 to fearless_simd --- src/kernels/dit.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index a0c204b..182822e 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -6,7 +6,7 @@ use core::f32; use num_traits::Float; use fearless_simd::{Simd, SimdBase, SimdFrom, SimdFloat}; -use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; +use wide::{f32x16, f32x4, f32x8, f64x4, f64x8}; use crate::kernels::common::fft_chunk_2; @@ -117,6 +117,7 @@ pub fn fft_dit_chunk_4_simd_f32(reals: &mut [f32], imags: &mut [f32]) { /// DIT butterfly for chunk_size == 8 (f64) with SIMD #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 4; const CHUNK_SIZE: usize = DIST << 1; @@ -140,10 +141,10 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); - let in0_re = f64x4::simd_from(reals_s0[0..4].try_into().unwrap(), simd); - let in1_re = f64x4::simd_from(reals_s1[0..4].try_into().unwrap(), simd); - let in0_im = f64x4::simd_from(imags_s0[0..4].try_into().unwrap(), simd); - let in1_im = f64x4::simd_from(imags_s1[0..4].try_into().unwrap(), simd); + let in0_re = f64x4::simd_from(<[f64; 4]>::try_from(&reals_s0[0..4]).unwrap(), simd); + let in1_re = f64x4::simd_from(<[f64; 4]>::try_from(&reals_s1[0..4]).unwrap(), simd); + let in0_im = f64x4::simd_from(<[f64; 4]>::try_from(&imags_s0[0..4]).unwrap(), simd); + let in1_im = f64x4::simd_from(<[f64; 4]>::try_from(&imags_s1[0..4]).unwrap(), simd); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re)); From 1333b094c5a997fac2bd2e3f05d92ed50314acb6 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 12 Dec 2025 13:15:52 +0000 Subject: [PATCH 04/12] Convert the rest of DIT functions to fearless_simd --- src/kernels/dit.rs | 491 +++++++++++++++++++-------------------------- 1 file changed, 207 insertions(+), 284 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 182822e..3e0c9ac 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -17,16 +17,8 @@ pub fn fft_dit_chunk_2(reals: &mut [T], imags: &mut [T]) { } /// DIT butterfly for chunk_size == 4 (f64) -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_4_simd_f64(reals: &mut [f64], imags: &mut [f64]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_4_simd_f64(_simd: S, reals: &mut [f64], imags: &mut [f64]) { const DIST: usize = 2; const CHUNK_SIZE: usize = DIST << 1; @@ -66,16 +58,8 @@ pub fn fft_dit_chunk_4_simd_f64(reals: &mut [f64], imags: &mut [f64]) { } /// DIT butterfly for chunk_size == 4 (f32) -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_4_simd_f32(reals: &mut [f32], imags: &mut [f32]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_4_simd_f32(_simd: S, reals: &mut [f32], imags: &mut [f32]) { const DIST: usize = 2; const CHUNK_SIZE: usize = DIST << 1; @@ -163,32 +147,25 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut } /// DIT butterfly for chunk_size == 8 (f32) with SIMD -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_8_simd_f32(simd: S, reals: &mut [f32], imags: &mut [f32]) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 4; const CHUNK_SIZE: usize = DIST << 1; - let two = f32x4::splat(2.0); - let sqrt2_2 = f32x4::new([ + let two = f32x4::splat(simd, 2.0); + let sqrt2_2 = f32x4::simd_from([ 1.0_f32, // W_8^0 real std::f32::consts::FRAC_1_SQRT_2, // W_8^1 real (sqrt(2)/2) 0.0_f32, // W_8^2 real -std::f32::consts::FRAC_1_SQRT_2, // W_8^3 real (-sqrt(2)/2) - ]); - let sqrt2_2_im = f32x4::new([ + ], simd); + let sqrt2_2_im = f32x4::simd_from([ 0.0_f32, // W_8^0 imag -std::f32::consts::FRAC_1_SQRT_2, // W_8^1 imag (-sqrt(2)/2) -1.0_f32, // W_8^2 imag -std::f32::consts::FRAC_1_SQRT_2, // W_8^3 imag (-sqrt(2)/2) - ]); + ], simd); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -196,10 +173,10 @@ pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); - let in0_re = f32x4::new(reals_s0[0..4].try_into().unwrap()); - let in1_re = f32x4::new(reals_s1[0..4].try_into().unwrap()); - let in0_im = f32x4::new(imags_s0[0..4].try_into().unwrap()); - let in1_im = f32x4::new(imags_s1[0..4].try_into().unwrap()); + let in0_re = f32x4::simd_from(<[f32; 4]>::try_from(&reals_s0[0..4]).unwrap(), simd); + let in1_re = f32x4::simd_from(<[f32; 4]>::try_from(&reals_s1[0..4]).unwrap(), simd); + let in0_im = f32x4::simd_from(<[f32; 4]>::try_from(&imags_s0[0..4]).unwrap(), simd); + let in1_im = f32x4::simd_from(<[f32; 4]>::try_from(&imags_s1[0..4]).unwrap(), simd); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re)); @@ -210,31 +187,24 @@ pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_array()); - imags_s0.copy_from_slice(out0_im.as_array()); - reals_s1.copy_from_slice(out1_re.as_array()); - imags_s1.copy_from_slice(out1_im.as_array()); + reals_s0.copy_from_slice(out0_re.as_slice()); + imags_s0.copy_from_slice(out0_im.as_slice()); + reals_s1.copy_from_slice(out1_re.as_slice()); + imags_s1.copy_from_slice(out1_im.as_slice()); }); } -/// DIT butterfly for chunk_size == 16 (f64) -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) { +/// DIT butterfly for chunk_size == 16 (f64) +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 8; const CHUNK_SIZE: usize = DIST << 1; - let two = f64x8::splat(2.0); + let two = f64x8::splat(simd, 2.0); // Twiddle factors for W_16^k where k = 0..7 - let twiddle_re = f64x8::new([ + let twiddle_re = f64x8::simd_from([ 1.0, // W_16^0 0.9238795325112867, // W_16^1 = cos(pi/8) std::f64::consts::FRAC_1_SQRT_2, // W_16^2 = sqrt(2)/2 @@ -243,9 +213,9 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.38268343236508984, // W_16^5 = -cos(3*pi/8) -std::f64::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 -0.9238795325112867, // W_16^7 = -cos(pi/8) - ]); + ], simd); - let twiddle_im = f64x8::new([ + let twiddle_im = f64x8::simd_from([ 0.0, // W_16^0 -0.38268343236508984, // W_16^1 = -sin(pi/8) -std::f64::consts::FRAC_1_SQRT_2, // W_16^2 = -sqrt(2)/2 @@ -254,7 +224,7 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.9238795325112867, // W_16^5 = -sin(3*pi/8) -std::f64::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 -0.38268343236508984, // W_16^7 = -sin(pi/8) - ]); + ], simd); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -263,10 +233,10 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Load all 8 elements at once - let in0_re = f64x8::new(reals_s0[0..8].try_into().unwrap()); - let in1_re = f64x8::new(reals_s1[0..8].try_into().unwrap()); - let in0_im = f64x8::new(imags_s0[0..8].try_into().unwrap()); - let in1_im = f64x8::new(imags_s1[0..8].try_into().unwrap()); + let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[0..8]).unwrap(), simd); + let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[0..8]).unwrap(), simd); + let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[0..8]).unwrap(), simd); + let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[0..8]).unwrap(), simd); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -275,31 +245,24 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_array()); - imags_s0.copy_from_slice(out0_im.as_array()); - reals_s1.copy_from_slice(out1_re.as_array()); - imags_s1.copy_from_slice(out1_im.as_array()); + reals_s0.copy_from_slice(out0_re.as_slice()); + imags_s0.copy_from_slice(out0_im.as_slice()); + reals_s1.copy_from_slice(out1_re.as_slice()); + imags_s1.copy_from_slice(out1_im.as_slice()); }); } /// DIT butterfly for chunk_size == 16 (f32) -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mut [f32]) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 8; const CHUNK_SIZE: usize = DIST << 1; - let two = f32x8::splat(2.0); + let two = f32x8::splat(simd, 2.0); // Twiddle factors for W_16^k where k = 0..7 - let twiddle_re = f32x8::new([ + let twiddle_re = f32x8::simd_from([ 1.0_f32, // W_16^0 0.923_879_5_f32, // W_16^1 = cos(pi/8) std::f32::consts::FRAC_1_SQRT_2, // W_16^2 = sqrt(2)/2 @@ -308,9 +271,9 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) { -0.382_683_43_f32, // W_16^5 = -cos(3*pi/8) -std::f32::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 -0.923_879_5_f32, // W_16^7 = -cos(pi/8) - ]); + ], simd); - let twiddle_im = f32x8::new([ + let twiddle_im = f32x8::simd_from([ 0.0_f32, // W_16^0 -0.382_683_43_f32, // W_16^1 = -sin(pi/8) -std::f32::consts::FRAC_1_SQRT_2, // W_16^2 = -sqrt(2)/2 @@ -319,7 +282,7 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) { -0.923_879_5_f32, // W_16^5 = -sin(3*pi/8) -std::f32::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 -0.382_683_43_f32, // W_16^7 = -sin(pi/8) - ]); + ], simd); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -328,10 +291,10 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Load all 8 elements at once - let in0_re = f32x8::new(reals_s0[0..8].try_into().unwrap()); - let in1_re = f32x8::new(reals_s1[0..8].try_into().unwrap()); - let in0_im = f32x8::new(imags_s0[0..8].try_into().unwrap()); - let in1_im = f32x8::new(imags_s1[0..8].try_into().unwrap()); + let in0_re = f32x8::simd_from(<[f32; 8]>::try_from(&reals_s0[0..8]).unwrap(), simd); + let in1_re = f32x8::simd_from(<[f32; 8]>::try_from(&reals_s1[0..8]).unwrap(), simd); + let in0_im = f32x8::simd_from(<[f32; 8]>::try_from(&imags_s0[0..8]).unwrap(), simd); + let in1_im = f32x8::simd_from(<[f32; 8]>::try_from(&imags_s1[0..8]).unwrap(), simd); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -340,30 +303,23 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_array()); - imags_s0.copy_from_slice(out0_im.as_array()); - reals_s1.copy_from_slice(out1_re.as_array()); - imags_s1.copy_from_slice(out1_im.as_array()); + reals_s0.copy_from_slice(out0_re.as_slice()); + imags_s0.copy_from_slice(out0_im.as_slice()); + reals_s1.copy_from_slice(out1_re.as_slice()); + imags_s1.copy_from_slice(out1_im.as_slice()); }); } /// DIT butterfly for chunk_size == 32 (f64) -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 16; const CHUNK_SIZE: usize = DIST << 1; - let two = f64x8::splat(2.0); + let two = f64x8::splat(simd, 2.0); // First 8 twiddle factors for W_32^k where k = 0..7 - let twiddle_re_0_7 = f64x8::new([ + let twiddle_re_0_7 = f64x8::simd_from([ 1.0, // W_32^0 = 1 0.9807852804032304, // W_32^1 = cos(π/16) 0.9238795325112867, // W_32^2 = cos(π/8) @@ -372,9 +328,9 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { 0.5555702330196022, // W_32^5 = cos(5π/16) 0.3826834323650898, // W_32^6 = cos(3π/8) 0.19509032201612825, // W_32^7 = cos(7π/16) - ]); + ], simd); - let twiddle_im_0_7 = f64x8::new([ + let twiddle_im_0_7 = f64x8::simd_from([ 0.0, // W_32^0 -0.19509032201612825, // W_32^1 = -sin(π/16) -0.3826834323650898, // W_32^2 = -sin(π/8) @@ -383,10 +339,10 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.8314696123025452, // W_32^5 = -sin(5π/16) -0.9238795325112867, // W_32^6 = -sin(3π/8) -0.9807852804032304, // W_32^7 = -sin(7π/16) - ]); + ], simd); // Second 8 twiddle factors for W_32^k where k = 8..15 - let twiddle_re_8_15 = f64x8::new([ + let twiddle_re_8_15 = f64x8::simd_from([ 0.0, // W_32^8 = 0 - i -0.19509032201612825, // W_32^9 -0.3826834323650898, // W_32^10 @@ -395,9 +351,9 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.8314696123025452, // W_32^13 -0.9238795325112867, // W_32^14 -0.9807852804032304, // W_32^15 - ]); + ], simd); - let twiddle_im_8_15 = f64x8::new([ + let twiddle_im_8_15 = f64x8::simd_from([ -1.0, // W_32^8 -0.9807852804032304, // W_32^9 -0.9238795325112867, // W_32^10 @@ -406,7 +362,7 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.5555702330196022, // W_32^13 -0.3826834323650898, // W_32^14 -0.19509032201612825, // W_32^15 - ]); + ], simd); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -415,10 +371,10 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process first 8 butterflies - let in0_re_0_7 = f64x8::new(reals_s0[0..8].try_into().unwrap()); - let in1_re_0_7 = f64x8::new(reals_s1[0..8].try_into().unwrap()); - let in0_im_0_7 = f64x8::new(imags_s0[0..8].try_into().unwrap()); - let in1_im_0_7 = f64x8::new(imags_s1[0..8].try_into().unwrap()); + let in0_re_0_7 = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[0..8]).unwrap(), simd); + let in1_re_0_7 = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[0..8]).unwrap(), simd); + let in0_im_0_7 = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[0..8]).unwrap(), simd); + let in1_im_0_7 = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[0..8]).unwrap(), simd); let out0_re_0_7 = twiddle_im_0_7.mul_add(-in1_im_0_7, twiddle_re_0_7.mul_add(in1_re_0_7, in0_re_0_7)); @@ -428,16 +384,16 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let out1_re_0_7 = two.mul_sub(in0_re_0_7, out0_re_0_7); let out1_im_0_7 = two.mul_sub(in0_im_0_7, out0_im_0_7); - reals_s0[0..8].copy_from_slice(out0_re_0_7.as_array()); - imags_s0[0..8].copy_from_slice(out0_im_0_7.as_array()); - reals_s1[0..8].copy_from_slice(out1_re_0_7.as_array()); - imags_s1[0..8].copy_from_slice(out1_im_0_7.as_array()); + reals_s0[0..8].copy_from_slice(out0_re_0_7.as_slice()); + imags_s0[0..8].copy_from_slice(out0_im_0_7.as_slice()); + reals_s1[0..8].copy_from_slice(out1_re_0_7.as_slice()); + imags_s1[0..8].copy_from_slice(out1_im_0_7.as_slice()); // Process second 8 butterflies - let in0_re_8_15 = f64x8::new(reals_s0[8..16].try_into().unwrap()); - let in1_re_8_15 = f64x8::new(reals_s1[8..16].try_into().unwrap()); - let in0_im_8_15 = f64x8::new(imags_s0[8..16].try_into().unwrap()); - let in1_im_8_15 = f64x8::new(imags_s1[8..16].try_into().unwrap()); + let in0_re_8_15 = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[8..16]).unwrap(), simd); + let in1_re_8_15 = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[8..16]).unwrap(), simd); + let in0_im_8_15 = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[8..16]).unwrap(), simd); + let in1_im_8_15 = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[8..16]).unwrap(), simd); let out0_re_8_15 = twiddle_im_8_15.mul_add( -in1_im_8_15, @@ -451,31 +407,24 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let out1_re_8_15 = two.mul_sub(in0_re_8_15, out0_re_8_15); let out1_im_8_15 = two.mul_sub(in0_im_8_15, out0_im_8_15); - reals_s0[8..16].copy_from_slice(out0_re_8_15.as_array()); - imags_s0[8..16].copy_from_slice(out0_im_8_15.as_array()); - reals_s1[8..16].copy_from_slice(out1_re_8_15.as_array()); - imags_s1[8..16].copy_from_slice(out1_im_8_15.as_array()); + reals_s0[8..16].copy_from_slice(out0_re_8_15.as_slice()); + imags_s0[8..16].copy_from_slice(out0_im_8_15.as_slice()); + reals_s1[8..16].copy_from_slice(out1_re_8_15.as_slice()); + imags_s1[8..16].copy_from_slice(out1_im_8_15.as_slice()); }); } /// DIT butterfly for chunk_size == 32 (f32) -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mut [f32]) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 16; const CHUNK_SIZE: usize = DIST << 1; - let two = f32x16::splat(2.0); + let two = f32x16::splat(simd, 2.0); // All 16 twiddle factors for W_32^k where k = 0..15 - let twiddle_re = f32x16::new([ + let twiddle_re = f32x16::simd_from([ 1.0_f32, // W_32^0 = 1 0.980_785_25_f32, // W_32^1 = cos(π/16) 0.923_879_5_f32, // W_32^2 = cos(π/8) @@ -492,9 +441,9 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) { -0.831_469_6_f32, // W_32^13 -0.923_879_5_f32, // W_32^14 -0.980_785_25_f32, // W_32^15 - ]); + ], simd); - let twiddle_im = f32x16::new([ + let twiddle_im = f32x16::simd_from([ 0.0_f32, // W_32^0 -0.195_090_32_f32, // W_32^1 = -sin(π/16) -0.382_683_43_f32, // W_32^2 = -sin(π/8) @@ -511,7 +460,7 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) { -0.555_570_24_f32, // W_32^13 -0.382_683_43_f32, // W_32^14 -0.195_090_32_f32, // W_32^15 - ]); + ], simd); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -520,10 +469,10 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process all 16 butterflies at once with f32x16 - let in0_re = f32x16::new(reals_s0[0..16].try_into().unwrap()); - let in1_re = f32x16::new(reals_s1[0..16].try_into().unwrap()); - let in0_im = f32x16::new(imags_s0[0..16].try_into().unwrap()); - let in1_im = f32x16::new(imags_s1[0..16].try_into().unwrap()); + let in0_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s0[0..16]).unwrap(), simd); + let in1_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s1[0..16]).unwrap(), simd); + let in0_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s0[0..16]).unwrap(), simd); + let in1_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s1[0..16]).unwrap(), simd); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -531,32 +480,25 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_array()); - imags_s0.copy_from_slice(out0_im.as_array()); - reals_s1.copy_from_slice(out1_re.as_array()); - imags_s1.copy_from_slice(out1_im.as_array()); + reals_s0.copy_from_slice(out0_re.as_slice()); + imags_s0.copy_from_slice(out0_im.as_slice()); + reals_s1.copy_from_slice(out1_re.as_slice()); + imags_s1.copy_from_slice(out1_im.as_slice()); }); } /// DIT butterfly for chunk_size == 64 (f64) -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 32; const CHUNK_SIZE: usize = DIST << 1; - let two = f64x8::splat(2.0); + let two = f64x8::splat(simd, 2.0); // Process in 4 iterations of 8 butterflies each // Twiddles for W_64^k where k = 0..7 - let twiddle_re_0_7 = f64x8::new([ + let twiddle_re_0_7 = f64x8::simd_from([ 1.0, // W_64^0 = 1 0.9951847266721969, // W_64^1 = cos(π/32) 0.9807852804032304, // W_64^2 = cos(π/16) @@ -565,9 +507,9 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { 0.8819212643483549, // W_64^5 = cos(5π/32) 0.8314696123025452, // W_64^6 = cos(3π/16) 0.773010453362737, // W_64^7 = cos(7π/32) - ]); + ], simd); - let twiddle_im_0_7 = f64x8::new([ + let twiddle_im_0_7 = f64x8::simd_from([ 0.0, // W_64^0 -0.0980171403295606, // W_64^1 = -sin(π/32) -0.19509032201612825, // W_64^2 = -sin(π/16) @@ -576,10 +518,10 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.47139673682599764, // W_64^5 = -sin(5π/32) -0.5555702330196022, // W_64^6 = -sin(3π/16) -0.6343932841636455, // W_64^7 = -sin(7π/32) - ]); + ], simd); // Twiddles for k = 8..15 - let twiddle_re_8_15 = f64x8::new([ + let twiddle_re_8_15 = f64x8::simd_from([ std::f64::consts::FRAC_1_SQRT_2, // W_64^8 = sqrt(2)/2 0.6343932841636455, // W_64^9 0.5555702330196022, // W_64^10 @@ -588,9 +530,9 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { 0.29028467725446233, // W_64^13 0.19509032201612825, // W_64^14 0.0980171403295606, // W_64^15 - ]); + ], simd); - let twiddle_im_8_15 = f64x8::new([ + let twiddle_im_8_15 = f64x8::simd_from([ -std::f64::consts::FRAC_1_SQRT_2, // W_64^8 -0.773010453362737, // W_64^9 -0.8314696123025452, // W_64^10 @@ -599,10 +541,10 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.9569403357322089, // W_64^13 -0.9807852804032304, // W_64^14 -0.9951847266721969, // W_64^15 - ]); + ], simd); // Twiddles for k = 16..23 - let twiddle_re_16_23 = f64x8::new([ + let twiddle_re_16_23 = f64x8::simd_from([ 0.0, // W_64^16 = -i -0.0980171403295606, // W_64^17 -0.19509032201612825, // W_64^18 @@ -611,9 +553,9 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.47139673682599764, // W_64^21 -0.5555702330196022, // W_64^22 -0.6343932841636455, // W_64^23 - ]); + ], simd); - let twiddle_im_16_23 = f64x8::new([ + let twiddle_im_16_23 = f64x8::simd_from([ -1.0, // W_64^16 -0.9951847266721969, // W_64^17 -0.9807852804032304, // W_64^18 @@ -622,10 +564,10 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.8819212643483549, // W_64^21 -0.8314696123025452, // W_64^22 -0.773010453362737, // W_64^23 - ]); + ], simd); // Twiddles for k = 24..31 - let twiddle_re_24_31 = f64x8::new([ + let twiddle_re_24_31 = f64x8::simd_from([ -std::f64::consts::FRAC_1_SQRT_2, // W_64^24 -0.773010453362737, // W_64^25 -0.8314696123025452, // W_64^26 @@ -634,9 +576,9 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.9569403357322089, // W_64^29 -0.9807852804032304, // W_64^30 -0.9951847266721969, // W_64^31 - ]); + ], simd); - let twiddle_im_24_31 = f64x8::new([ + let twiddle_im_24_31 = f64x8::simd_from([ -std::f64::consts::FRAC_1_SQRT_2, // W_64^24 -0.6343932841636455, // W_64^25 -0.5555702330196022, // W_64^26 @@ -645,7 +587,7 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { -0.29028467725446233, // W_64^29 -0.19509032201612825, // W_64^30 -0.0980171403295606, // W_64^31 - ]); + ], simd); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -654,42 +596,42 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process butterflies 0..7 - let in0_re = f64x8::new(reals_s0[0..8].try_into().unwrap()); - let in1_re = f64x8::new(reals_s1[0..8].try_into().unwrap()); - let in0_im = f64x8::new(imags_s0[0..8].try_into().unwrap()); - let in1_im = f64x8::new(imags_s1[0..8].try_into().unwrap()); + let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[0..8]).unwrap(), simd); + let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[0..8]).unwrap(), simd); + let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[0..8]).unwrap(), simd); + let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[0..8]).unwrap(), simd); let out0_re = twiddle_im_0_7.mul_add(-in1_im, twiddle_re_0_7.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_0_7.mul_add(in1_re, twiddle_re_0_7.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[0..8].copy_from_slice(out0_re.as_array()); - imags_s0[0..8].copy_from_slice(out0_im.as_array()); - reals_s1[0..8].copy_from_slice(out1_re.as_array()); - imags_s1[0..8].copy_from_slice(out1_im.as_array()); + reals_s0[0..8].copy_from_slice(out0_re.as_slice()); + imags_s0[0..8].copy_from_slice(out0_im.as_slice()); + reals_s1[0..8].copy_from_slice(out1_re.as_slice()); + imags_s1[0..8].copy_from_slice(out1_im.as_slice()); // Process butterflies 8..15 - let in0_re = f64x8::new(reals_s0[8..16].try_into().unwrap()); - let in1_re = f64x8::new(reals_s1[8..16].try_into().unwrap()); - let in0_im = f64x8::new(imags_s0[8..16].try_into().unwrap()); - let in1_im = f64x8::new(imags_s1[8..16].try_into().unwrap()); + let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[8..16]).unwrap(), simd); + let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[8..16]).unwrap(), simd); + let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[8..16]).unwrap(), simd); + let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[8..16]).unwrap(), simd); let out0_re = twiddle_im_8_15.mul_add(-in1_im, twiddle_re_8_15.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_8_15.mul_add(in1_re, twiddle_re_8_15.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[8..16].copy_from_slice(out0_re.as_array()); - imags_s0[8..16].copy_from_slice(out0_im.as_array()); - reals_s1[8..16].copy_from_slice(out1_re.as_array()); - imags_s1[8..16].copy_from_slice(out1_im.as_array()); + reals_s0[8..16].copy_from_slice(out0_re.as_slice()); + imags_s0[8..16].copy_from_slice(out0_im.as_slice()); + reals_s1[8..16].copy_from_slice(out1_re.as_slice()); + imags_s1[8..16].copy_from_slice(out1_im.as_slice()); // Process butterflies 16..23 - let in0_re = f64x8::new(reals_s0[16..24].try_into().unwrap()); - let in1_re = f64x8::new(reals_s1[16..24].try_into().unwrap()); - let in0_im = f64x8::new(imags_s0[16..24].try_into().unwrap()); - let in1_im = f64x8::new(imags_s1[16..24].try_into().unwrap()); + let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[16..24]).unwrap(), simd); + let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[16..24]).unwrap(), simd); + let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[16..24]).unwrap(), simd); + let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[16..24]).unwrap(), simd); let out0_re = twiddle_im_16_23.mul_add(-in1_im, twiddle_re_16_23.mul_add(in1_re, in0_re)); @@ -698,16 +640,16 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[16..24].copy_from_slice(out0_re.as_array()); - imags_s0[16..24].copy_from_slice(out0_im.as_array()); - reals_s1[16..24].copy_from_slice(out1_re.as_array()); - imags_s1[16..24].copy_from_slice(out1_im.as_array()); + reals_s0[16..24].copy_from_slice(out0_re.as_slice()); + imags_s0[16..24].copy_from_slice(out0_im.as_slice()); + reals_s1[16..24].copy_from_slice(out1_re.as_slice()); + imags_s1[16..24].copy_from_slice(out1_im.as_slice()); // Process butterflies 24..31 - let in0_re = f64x8::new(reals_s0[24..32].try_into().unwrap()); - let in1_re = f64x8::new(reals_s1[24..32].try_into().unwrap()); - let in0_im = f64x8::new(imags_s0[24..32].try_into().unwrap()); - let in1_im = f64x8::new(imags_s1[24..32].try_into().unwrap()); + let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[24..32]).unwrap(), simd); + let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[24..32]).unwrap(), simd); + let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[24..32]).unwrap(), simd); + let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[24..32]).unwrap(), simd); let out0_re = twiddle_im_24_31.mul_add(-in1_im, twiddle_re_24_31.mul_add(in1_re, in0_re)); @@ -716,32 +658,25 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[24..32].copy_from_slice(out0_re.as_array()); - imags_s0[24..32].copy_from_slice(out0_im.as_array()); - reals_s1[24..32].copy_from_slice(out1_re.as_array()); - imags_s1[24..32].copy_from_slice(out1_im.as_array()); + reals_s0[24..32].copy_from_slice(out0_re.as_slice()); + imags_s0[24..32].copy_from_slice(out0_im.as_slice()); + reals_s1[24..32].copy_from_slice(out1_re.as_slice()); + imags_s1[24..32].copy_from_slice(out1_im.as_slice()); }); } /// DIT butterfly for chunk_size == 64 (f32) -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mut [f32]) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 32; const CHUNK_SIZE: usize = DIST << 1; - let two = f32x16::splat(2.0); + let two = f32x16::splat(simd, 2.0); // Process in 2 iterations of 16 butterflies each // Twiddles for W_64^k where k = 0..15 - let twiddle_re_0_15 = f32x16::new([ + let twiddle_re_0_15 = f32x16::simd_from([ 1.0_f32, // W_64^0 = 1 0.995_184_7_f32, // W_64^1 = cos(π/32) 0.980_785_25_f32, // W_64^2 = cos(π/16) @@ -758,9 +693,9 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { 0.290_284_66_f32, // W_64^13 0.195_090_32_f32, // W_64^14 0.098_017_14_f32, // W_64^15 - ]); + ], simd); - let twiddle_im_0_15 = f32x16::new([ + let twiddle_im_0_15 = f32x16::simd_from([ 0.0_f32, // W_64^0 -0.098_017_14_f32, // W_64^1 = -sin(π/32) -0.195_090_32_f32, // W_64^2 = -sin(π/16) @@ -777,10 +712,10 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { -0.956_940_35_f32, // W_64^13 -0.980_785_25_f32, // W_64^14 -0.995_184_7_f32, // W_64^15 - ]); + ], simd); // Twiddles for k = 16..31 - let twiddle_re_16_31 = f32x16::new([ + let twiddle_re_16_31 = f32x16::simd_from([ 0.0_f32, // W_64^16 = -i -0.098_017_14_f32, // W_64^17 -0.195_090_32_f32, // W_64^18 @@ -797,9 +732,9 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { -0.956_940_35_f32, // W_64^29 -0.980_785_25_f32, // W_64^30 -0.995_184_7_f32, // W_64^31 - ]); + ], simd); - let twiddle_im_16_31 = f32x16::new([ + let twiddle_im_16_31 = f32x16::simd_from([ -1.0_f32, // W_64^16 -0.995_184_7_f32, // W_64^17 -0.980_785_25_f32, // W_64^18 @@ -816,7 +751,7 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { -0.290_284_66_f32, // W_64^29 -0.195_090_32_f32, // W_64^30 -0.098_017_14_f32, // W_64^31 - ]); + ], simd); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -825,26 +760,26 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process butterflies 0..15 - let in0_re = f32x16::new(reals_s0[0..16].try_into().unwrap()); - let in1_re = f32x16::new(reals_s1[0..16].try_into().unwrap()); - let in0_im = f32x16::new(imags_s0[0..16].try_into().unwrap()); - let in1_im = f32x16::new(imags_s1[0..16].try_into().unwrap()); + let in0_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s0[0..16]).unwrap(), simd); + let in1_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s1[0..16]).unwrap(), simd); + let in0_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s0[0..16]).unwrap(), simd); + let in1_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s1[0..16]).unwrap(), simd); let out0_re = twiddle_im_0_15.mul_add(-in1_im, twiddle_re_0_15.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_0_15.mul_add(in1_re, twiddle_re_0_15.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[0..16].copy_from_slice(out0_re.as_array()); - imags_s0[0..16].copy_from_slice(out0_im.as_array()); - reals_s1[0..16].copy_from_slice(out1_re.as_array()); - imags_s1[0..16].copy_from_slice(out1_im.as_array()); + reals_s0[0..16].copy_from_slice(out0_re.as_slice()); + imags_s0[0..16].copy_from_slice(out0_im.as_slice()); + reals_s1[0..16].copy_from_slice(out1_re.as_slice()); + imags_s1[0..16].copy_from_slice(out1_im.as_slice()); // Process butterflies 16..31 - let in0_re = f32x16::new(reals_s0[16..32].try_into().unwrap()); - let in1_re = f32x16::new(reals_s1[16..32].try_into().unwrap()); - let in0_im = f32x16::new(imags_s0[16..32].try_into().unwrap()); - let in1_im = f32x16::new(imags_s1[16..32].try_into().unwrap()); + let in0_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s0[16..32]).unwrap(), simd); + let in1_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s1[16..32]).unwrap(), simd); + let in0_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s0[16..32]).unwrap(), simd); + let in1_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s1[16..32]).unwrap(), simd); let out0_re = twiddle_im_16_31.mul_add(-in1_im, twiddle_re_16_31.mul_add(in1_re, in0_re)); @@ -853,30 +788,24 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[16..32].copy_from_slice(out0_re.as_array()); - imags_s0[16..32].copy_from_slice(out0_im.as_array()); - reals_s1[16..32].copy_from_slice(out1_re.as_array()); - imags_s1[16..32].copy_from_slice(out1_im.as_array()); + reals_s0[16..32].copy_from_slice(out0_re.as_slice()); + imags_s0[16..32].copy_from_slice(out0_im.as_slice()); + reals_s1[16..32].copy_from_slice(out1_re.as_slice()); + imags_s1[16..32].copy_from_slice(out1_im.as_slice()); }); } /// General DIT butterfly for f64 -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_64_chunk_n_simd( +#[inline(always)] // required by fearless_simd +pub fn fft_dit_64_chunk_n_simd( + simd: S, reals: &mut [f64], imags: &mut [f64], twiddles_re: &[f64], twiddles_im: &[f64], dist: usize, ) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const LANES: usize = 8; let chunk_size = dist << 1; assert!(chunk_size >= LANES * 2); @@ -895,14 +824,14 @@ pub fn fft_dit_64_chunk_n_simd( .zip(twiddles_re.as_chunks::().0.iter()) .zip(twiddles_im.as_chunks::().0.iter()) .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { - let two = f64x8::splat(2.0); - let in0_re = f64x8::new(*re_s0); - let in1_re = f64x8::new(*re_s1); - let in0_im = f64x8::new(*im_s0); - let in1_im = f64x8::new(*im_s1); + let two = f64x8::splat(simd, 2.0); + let in0_re = f64x8::simd_from(*re_s0, simd); + let in1_re = f64x8::simd_from(*re_s1, simd); + let in0_im = f64x8::simd_from(*im_s0, simd); + let in1_im = f64x8::simd_from(*im_s1, simd); - let tw_re = f64x8::new(*tw_re); - let tw_im = f64x8::new(*tw_im); + let tw_re = f64x8::simd_from(*tw_re, simd); + let tw_im = f64x8::simd_from(*tw_im, simd); // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re)); @@ -913,31 +842,25 @@ pub fn fft_dit_64_chunk_n_simd( let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - re_s0.copy_from_slice(out0_re.as_array()); - im_s0.copy_from_slice(out0_im.as_array()); - re_s1.copy_from_slice(out1_re.as_array()); - im_s1.copy_from_slice(out1_im.as_array()); + re_s0.copy_from_slice(out0_re.as_slice()); + im_s0.copy_from_slice(out0_im.as_slice()); + re_s1.copy_from_slice(out1_re.as_slice()); + im_s1.copy_from_slice(out1_im.as_slice()); }); }); } /// General DIT butterfly for f32 -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_32_chunk_n_simd( +#[inline(always)] // required by fearless_simd +pub fn fft_dit_32_chunk_n_simd( + simd: S, reals: &mut [f32], imags: &mut [f32], twiddles_re: &[f32], twiddles_im: &[f32], dist: usize, ) { + use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const LANES: usize = 16; let chunk_size = dist << 1; assert!(chunk_size >= LANES * 2); @@ -956,14 +879,14 @@ pub fn fft_dit_32_chunk_n_simd( .zip(twiddles_re.as_chunks::().0.iter()) .zip(twiddles_im.as_chunks::().0.iter()) .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { - let two = f32x16::splat(2.0); - let in0_re = f32x16::new(*re_s0); - let in1_re = f32x16::new(*re_s1); - let in0_im = f32x16::new(*im_s0); - let in1_im = f32x16::new(*im_s1); + let two = f32x16::splat(simd, 2.0); + let in0_re = f32x16::simd_from(*re_s0, simd); + let in1_re = f32x16::simd_from(*re_s1, simd); + let in0_im = f32x16::simd_from(*im_s0, simd); + let in1_im = f32x16::simd_from(*im_s1, simd); - let tw_re = f32x16::new(*tw_re); - let tw_im = f32x16::new(*tw_im); + let tw_re = f32x16::simd_from(*tw_re, simd); + let tw_im = f32x16::simd_from(*tw_im, simd); // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re)); @@ -974,10 +897,10 @@ pub fn fft_dit_32_chunk_n_simd( let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - re_s0.copy_from_slice(out0_re.as_array()); - im_s0.copy_from_slice(out0_im.as_array()); - re_s1.copy_from_slice(out1_re.as_array()); - im_s1.copy_from_slice(out1_im.as_array()); + re_s0.copy_from_slice(out0_re.as_slice()); + im_s0.copy_from_slice(out0_im.as_slice()); + re_s1.copy_from_slice(out1_re.as_slice()); + im_s1.copy_from_slice(out1_im.as_slice()); }); }); } From 1b17dbe4261dec39a6c17db8231356f4e25c3663 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 12 Dec 2025 13:17:41 +0000 Subject: [PATCH 05/12] Clean up imports --- src/kernels/dit.rs | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 3e0c9ac..fa85790 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -6,7 +6,7 @@ use core::f32; use num_traits::Float; use fearless_simd::{Simd, SimdBase, SimdFrom, SimdFloat}; -use wide::{f32x16, f32x4, f32x8, f64x4, f64x8}; +use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; use crate::kernels::common::fft_chunk_2; @@ -101,7 +101,6 @@ pub fn fft_dit_chunk_4_simd_f32(_simd: S, reals: &mut [f32], imags: &mu /// DIT butterfly for chunk_size == 8 (f64) with SIMD #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 4; const CHUNK_SIZE: usize = DIST << 1; @@ -149,7 +148,6 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut /// DIT butterfly for chunk_size == 8 (f32) with SIMD #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_8_simd_f32(simd: S, reals: &mut [f32], imags: &mut [f32]) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 4; const CHUNK_SIZE: usize = DIST << 1; @@ -197,7 +195,6 @@ pub fn fft_dit_chunk_8_simd_f32(simd: S, reals: &mut [f32], imags: &mut /// DIT butterfly for chunk_size == 16 (f64) #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 8; const CHUNK_SIZE: usize = DIST << 1; @@ -255,7 +252,6 @@ pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mu /// DIT butterfly for chunk_size == 16 (f32) #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mut [f32]) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 8; const CHUNK_SIZE: usize = DIST << 1; @@ -312,7 +308,6 @@ pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mu /// DIT butterfly for chunk_size == 32 (f64) #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 16; const CHUNK_SIZE: usize = DIST << 1; @@ -417,7 +412,6 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu /// DIT butterfly for chunk_size == 32 (f32) #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mut [f32]) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 16; const CHUNK_SIZE: usize = DIST << 1; @@ -490,7 +484,6 @@ pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mu /// DIT butterfly for chunk_size == 64 (f64) #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mut [f64]) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 32; const CHUNK_SIZE: usize = DIST << 1; @@ -668,7 +661,6 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu /// DIT butterfly for chunk_size == 64 (f32) #[inline(always)] // required by fearless_simd pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mut [f32]) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const DIST: usize = 32; const CHUNK_SIZE: usize = DIST << 1; @@ -805,7 +797,6 @@ pub fn fft_dit_64_chunk_n_simd( twiddles_im: &[f64], dist: usize, ) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const LANES: usize = 8; let chunk_size = dist << 1; assert!(chunk_size >= LANES * 2); @@ -860,7 +851,6 @@ pub fn fft_dit_32_chunk_n_simd( twiddles_im: &[f32], dist: usize, ) { - use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; const LANES: usize = 16; let chunk_size = dist << 1; assert!(chunk_size >= LANES * 2); From 6244269549dccea8aa724fb60548f04d978e7721 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 12 Dec 2025 13:36:03 +0000 Subject: [PATCH 06/12] Wire up new DIT kernel function signatures to DIT process --- src/algorithms/dit.rs | 47 ++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index 0353354..f87bc13 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -25,6 +25,7 @@ use crate::kernels::dit::{ use crate::options::Options; use crate::parallel::run_maybe_in_parallel; use crate::planner::{Direction, PlannerDit32, PlannerDit64}; +use fearless_simd::{Level, dispatch}; /// L1 cache block size in complex elements (8KB for f32, 16KB for f64) const L1_BLOCK_SIZE: usize = 1024; @@ -40,6 +41,7 @@ fn recursive_dit_fft_f64( planner: &PlannerDit64, opts: &Options, mut stage_twiddle_idx: usize, + simd_level: Level, ) -> usize { let log_size = size.ilog2() as usize; @@ -51,6 +53,7 @@ fn recursive_dit_fft_f64( stage, planner, stage_twiddle_idx, + simd_level ); } stage_twiddle_idx @@ -63,8 +66,8 @@ fn recursive_dit_fft_f64( // Recursively process both halves run_maybe_in_parallel( size > opts.smallest_parallel_chunk_size, - || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0), - || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0), + || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0, simd_level), + || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0, simd_level), ); // Both halves completed stages 0..log_half-1 @@ -79,6 +82,7 @@ fn recursive_dit_fft_f64( stage, planner, stage_twiddle_idx, + simd_level ); } @@ -94,6 +98,7 @@ fn recursive_dit_fft_f32( planner: &PlannerDit32, opts: &Options, mut stage_twiddle_idx: usize, + simd_level: Level, ) -> usize { let log_size = size.ilog2() as usize; @@ -105,6 +110,7 @@ fn recursive_dit_fft_f32( stage, planner, stage_twiddle_idx, + simd_level ); } stage_twiddle_idx @@ -117,8 +123,8 @@ fn recursive_dit_fft_f32( // Recursively process both halves run_maybe_in_parallel( size > opts.smallest_parallel_chunk_size, - || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0), - || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0), + || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0, simd_level), + || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0, simd_level), ); // Both halves completed stages 0..log_half-1 @@ -133,6 +139,7 @@ fn recursive_dit_fft_f32( stage, planner, stage_twiddle_idx, + simd_level ); } @@ -148,6 +155,7 @@ fn execute_dit_stage_f64( stage: usize, planner: &PlannerDit64, stage_twiddle_idx: usize, + simd_level: Level, ) -> usize { let dist = 1 << stage; let chunk_size = dist << 1; @@ -156,24 +164,24 @@ fn execute_dit_stage_f64( fft_dit_chunk_2(reals, imags); stage_twiddle_idx } else if chunk_size == 4 { - fft_dit_chunk_4_simd_f64(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_4_simd_f64(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 8 { - fft_dit_chunk_8_simd_f64(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_8_simd_f64(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 16 { - fft_dit_chunk_16_simd_f64(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_16_simd_f64(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 32 { - fft_dit_chunk_32_simd_f64(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_32_simd_f64(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 64 { - fft_dit_chunk_64_simd_f64(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_64_simd_f64(simd, reals, imags)); stage_twiddle_idx } else { // For larger chunks, use general kernel with twiddles from planner let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx]; - fft_dit_64_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist); + dispatch!(simd_level, simd => fft_dit_64_chunk_n_simd(simd, reals, imags, twiddles_re, twiddles_im, dist)); stage_twiddle_idx + 1 } } @@ -186,6 +194,7 @@ fn execute_dit_stage_f32( stage: usize, planner: &PlannerDit32, stage_twiddle_idx: usize, + simd_level: Level, ) -> usize { let dist = 1 << stage; let chunk_size = dist << 1; @@ -194,24 +203,24 @@ fn execute_dit_stage_f32( fft_dit_chunk_2(reals, imags); stage_twiddle_idx } else if chunk_size == 4 { - fft_dit_chunk_4_simd_f32(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_4_simd_f32(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 8 { - fft_dit_chunk_8_simd_f32(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_8_simd_f32(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 16 { - fft_dit_chunk_16_simd_f32(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_16_simd_f32(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 32 { - fft_dit_chunk_32_simd_f32(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_32_simd_f32(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 64 { - fft_dit_chunk_64_simd_f32(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_64_simd_f32(simd, reals, imags)); stage_twiddle_idx } else { // For larger chunks, use general kernel with twiddles from planner let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx]; - fft_dit_32_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist); + dispatch!(simd_level, simd => fft_dit_32_chunk_n_simd(simd, reals, imags, twiddles_re, twiddles_im, dist)); stage_twiddle_idx + 1 } } @@ -261,7 +270,8 @@ pub fn fft_64_dit_with_planner_and_opts( } } - recursive_dit_fft_f64(reals, imags, n, planner, opts, 0); + let simd_level = Level::new(); + recursive_dit_fft_f64(reals, imags, n, planner, opts, 0, simd_level); // Scaling for inverse transform if let Direction::Reverse = planner.direction { @@ -304,7 +314,8 @@ pub fn fft_32_dit_with_planner_and_opts( } } - recursive_dit_fft_f32(reals, imags, n, planner, opts, 0); + let simd_level = Level::new(); + recursive_dit_fft_f32(reals, imags, n, planner, opts, 0, simd_level); // Scaling for inverse transform if let Direction::Reverse = planner.direction { From f003c147f8083043cb0a62e4890b3b3df063aab1 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 12 Dec 2025 18:37:33 +0000 Subject: [PATCH 07/12] Dispatch to multiversioned fft_dit_chunk_2 via fearless_simd rather than multiversion --- src/algorithms/dit.rs | 4 ++-- src/kernels/dit.rs | 20 ++++++++++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index f87bc13..fe64dfa 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -161,7 +161,7 @@ fn execute_dit_stage_f64( let chunk_size = dist << 1; if chunk_size == 2 { - fft_dit_chunk_2(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_2(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 4 { dispatch!(simd_level, simd => fft_dit_chunk_4_simd_f64(simd, reals, imags)); @@ -200,7 +200,7 @@ fn execute_dit_stage_f32( let chunk_size = dist << 1; if chunk_size == 2 { - fft_dit_chunk_2(reals, imags); + dispatch!(simd_level, simd => fft_dit_chunk_2(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 4 { dispatch!(simd_level, simd => fft_dit_chunk_4_simd_f32(simd, reals, imags)); diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index fa85790..e865e21 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -8,12 +8,24 @@ use num_traits::Float; use fearless_simd::{Simd, SimdBase, SimdFrom, SimdFloat}; use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; -use crate::kernels::common::fft_chunk_2; - /// DIT butterfly for chunk_size == 2 /// Identical to DIF version (no twiddles at size 2) -pub fn fft_dit_chunk_2(reals: &mut [T], imags: &mut [T]) { - fft_chunk_2(reals, imags); +#[inline(always)] // required by fearless_simd +pub fn fft_dit_chunk_2(_simd: S, reals: &mut [T], imags: &mut [T]) { + reals + .chunks_exact_mut(2) + .zip(imags.chunks_exact_mut(2)) + .for_each(|(reals_chunk, imags_chunk)| { + let z0_re = reals_chunk[0]; + let z0_im = imags_chunk[0]; + let z1_re = reals_chunk[1]; + let z1_im = imags_chunk[1]; + + reals_chunk[0] = z0_re + z1_re; + imags_chunk[0] = z0_im + z1_im; + reals_chunk[1] = z0_re - z1_re; + imags_chunk[1] = z0_im - z1_im; + }); } /// DIT butterfly for chunk_size == 4 (f64) From a19a4e58e862829d83ba4988a75b33512b29542a Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 12 Dec 2025 18:55:57 +0000 Subject: [PATCH 08/12] move SIMD dispatch one level higher so that it's definitely, positively not messing anything up --- src/algorithms/dit.rs | 71 +++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index fe64dfa..bbd6057 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -14,6 +14,8 @@ //! DIT starts with fine-grained memory access and progressively works with //! larger contiguous chunks. //! +use fearless_simd::{dispatch, Level, Simd}; + use crate::algorithms::cobra::cobra_apply; use crate::kernels::dit::{ fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_chunk_16_simd_f32, @@ -25,7 +27,6 @@ use crate::kernels::dit::{ use crate::options::Options; use crate::parallel::run_maybe_in_parallel; use crate::planner::{Direction, PlannerDit32, PlannerDit64}; -use fearless_simd::{Level, dispatch}; /// L1 cache block size in complex elements (8KB for f32, 16KB for f64) const L1_BLOCK_SIZE: usize = 1024; @@ -34,26 +35,26 @@ const L1_BLOCK_SIZE: usize = 1024; /// /// Recursively divides by 2 until reaching L1 cache size, processes stages within /// each block, then processes cross-block stages on return. -fn recursive_dit_fft_f64( +fn recursive_dit_fft_f64( + simd: S, reals: &mut [f64], imags: &mut [f64], size: usize, planner: &PlannerDit64, opts: &Options, mut stage_twiddle_idx: usize, - simd_level: Level, ) -> usize { let log_size = size.ilog2() as usize; if size <= L1_BLOCK_SIZE { for stage in 0..log_size { stage_twiddle_idx = execute_dit_stage_f64( + simd, &mut reals[..size], &mut imags[..size], stage, planner, stage_twiddle_idx, - simd_level ); } stage_twiddle_idx @@ -66,8 +67,8 @@ fn recursive_dit_fft_f64( // Recursively process both halves run_maybe_in_parallel( size > opts.smallest_parallel_chunk_size, - || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0, simd_level), - || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0, simd_level), + || recursive_dit_fft_f64(simd, re_first_half, im_first_half, half, planner, opts, 0), + || recursive_dit_fft_f64(simd, re_second_half, im_second_half, half, planner, opts, 0), ); // Both halves completed stages 0..log_half-1 @@ -77,12 +78,12 @@ fn recursive_dit_fft_f64( // Process remaining stages that span both halves for stage in log_half..log_size { stage_twiddle_idx = execute_dit_stage_f64( + simd, &mut reals[..size], &mut imags[..size], stage, planner, stage_twiddle_idx, - simd_level ); } @@ -91,26 +92,26 @@ fn recursive_dit_fft_f64( } /// Recursive cache-blocked DIT FFT for f32 using post-order traversal. -fn recursive_dit_fft_f32( +fn recursive_dit_fft_f32( + simd: S, reals: &mut [f32], imags: &mut [f32], size: usize, planner: &PlannerDit32, opts: &Options, mut stage_twiddle_idx: usize, - simd_level: Level, ) -> usize { let log_size = size.ilog2() as usize; if size <= L1_BLOCK_SIZE { for stage in 0..log_size { stage_twiddle_idx = execute_dit_stage_f32( + simd, &mut reals[..size], &mut imags[..size], stage, planner, stage_twiddle_idx, - simd_level ); } stage_twiddle_idx @@ -123,8 +124,8 @@ fn recursive_dit_fft_f32( // Recursively process both halves run_maybe_in_parallel( size > opts.smallest_parallel_chunk_size, - || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0, simd_level), - || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0, simd_level), + || recursive_dit_fft_f32(simd, re_first_half, im_first_half, half, planner, opts, 0), + || recursive_dit_fft_f32(simd, re_second_half, im_second_half, half, planner, opts, 0), ); // Both halves completed stages 0..log_half-1 @@ -134,12 +135,12 @@ fn recursive_dit_fft_f32( // Process remaining stages that span both halves for stage in log_half..log_size { stage_twiddle_idx = execute_dit_stage_f32( + simd, &mut reals[..size], &mut imags[..size], stage, planner, stage_twiddle_idx, - simd_level ); } @@ -149,78 +150,82 @@ fn recursive_dit_fft_f32( /// Execute a single DIT stage, dispatching to appropriate kernel based on chunk size. /// Returns updated stage_twiddle_idx. -fn execute_dit_stage_f64( +fn execute_dit_stage_f64( + simd: S, reals: &mut [f64], imags: &mut [f64], stage: usize, planner: &PlannerDit64, stage_twiddle_idx: usize, - simd_level: Level, ) -> usize { let dist = 1 << stage; let chunk_size = dist << 1; if chunk_size == 2 { - dispatch!(simd_level, simd => fft_dit_chunk_2(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 4 { - dispatch!(simd_level, simd => fft_dit_chunk_4_simd_f64(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_4_simd_f64(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 8 { - dispatch!(simd_level, simd => fft_dit_chunk_8_simd_f64(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_8_simd_f64(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 16 { - dispatch!(simd_level, simd => fft_dit_chunk_16_simd_f64(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_16_simd_f64(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 32 { - dispatch!(simd_level, simd => fft_dit_chunk_32_simd_f64(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_32_simd_f64(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 64 { - dispatch!(simd_level, simd => fft_dit_chunk_64_simd_f64(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_64_simd_f64(simd, reals, imags)); stage_twiddle_idx } else { // For larger chunks, use general kernel with twiddles from planner let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx]; - dispatch!(simd_level, simd => fft_dit_64_chunk_n_simd(simd, reals, imags, twiddles_re, twiddles_im, dist)); + simd.vectorize(|| { + fft_dit_64_chunk_n_simd(simd, reals, imags, twiddles_re, twiddles_im, dist) + }); stage_twiddle_idx + 1 } } /// Execute a single DIT stage, dispatching to appropriate kernel based on chunk size. /// Returns updated stage_twiddle_idx. -fn execute_dit_stage_f32( +fn execute_dit_stage_f32( + simd: S, reals: &mut [f32], imags: &mut [f32], stage: usize, planner: &PlannerDit32, stage_twiddle_idx: usize, - simd_level: Level, ) -> usize { let dist = 1 << stage; let chunk_size = dist << 1; if chunk_size == 2 { - dispatch!(simd_level, simd => fft_dit_chunk_2(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 4 { - dispatch!(simd_level, simd => fft_dit_chunk_4_simd_f32(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_4_simd_f32(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 8 { - dispatch!(simd_level, simd => fft_dit_chunk_8_simd_f32(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_8_simd_f32(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 16 { - dispatch!(simd_level, simd => fft_dit_chunk_16_simd_f32(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_16_simd_f32(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 32 { - dispatch!(simd_level, simd => fft_dit_chunk_32_simd_f32(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_32_simd_f32(simd, reals, imags)); stage_twiddle_idx } else if chunk_size == 64 { - dispatch!(simd_level, simd => fft_dit_chunk_64_simd_f32(simd, reals, imags)); + simd.vectorize(|| fft_dit_chunk_64_simd_f32(simd, reals, imags)); stage_twiddle_idx } else { // For larger chunks, use general kernel with twiddles from planner let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx]; - dispatch!(simd_level, simd => fft_dit_32_chunk_n_simd(simd, reals, imags, twiddles_re, twiddles_im, dist)); + simd.vectorize(|| { + fft_dit_32_chunk_n_simd(simd, reals, imags, twiddles_re, twiddles_im, dist) + }); stage_twiddle_idx + 1 } } @@ -271,7 +276,7 @@ pub fn fft_64_dit_with_planner_and_opts( } let simd_level = Level::new(); - recursive_dit_fft_f64(reals, imags, n, planner, opts, 0, simd_level); + dispatch!(simd_level, simd => recursive_dit_fft_f64(simd, reals, imags, n, planner, opts, 0)); // Scaling for inverse transform if let Direction::Reverse = planner.direction { @@ -315,7 +320,7 @@ pub fn fft_32_dit_with_planner_and_opts( } let simd_level = Level::new(); - recursive_dit_fft_f32(reals, imags, n, planner, opts, 0, simd_level); + dispatch!(simd_level, simd => recursive_dit_fft_f32(simd, reals, imags, n, planner, opts, 0)); // Scaling for inverse transform if let Direction::Reverse = planner.direction { From 35a7b601d20ccb1be0c42389ceb84de382f8b395 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 21 Jan 2026 12:08:43 +0000 Subject: [PATCH 09/12] Update for the swapped order of arguments in simd_from --- src/kernels/dit.rs | 232 ++++++++++++++++++++++----------------------- 1 file changed, 116 insertions(+), 116 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index e865e21..bd36e05 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -117,18 +117,18 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut const CHUNK_SIZE: usize = DIST << 1; let two = f64x4::splat(simd, 2.0); - let sqrt2_2 = f64x4::simd_from([ + let sqrt2_2 = f64x4::simd_from(simd, [ 1.0, // W_8^0 real std::f64::consts::FRAC_1_SQRT_2, // W_8^1 real (sqrt(2)/2) 0.0, // W_8^2 real -std::f64::consts::FRAC_1_SQRT_2, // W_8^3 real (-sqrt(2)/2) - ], simd); - let sqrt2_2_im = f64x4::simd_from([ + ]); + let sqrt2_2_im = f64x4::simd_from(simd, [ 0.0, // W_8^0 imag -std::f64::consts::FRAC_1_SQRT_2, // W_8^1 imag (-sqrt(2)/2) -1.0, // W_8^2 imag -std::f64::consts::FRAC_1_SQRT_2, // W_8^3 imag (-sqrt(2)/2) - ], simd); + ]); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -136,10 +136,10 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); - let in0_re = f64x4::simd_from(<[f64; 4]>::try_from(&reals_s0[0..4]).unwrap(), simd); - let in1_re = f64x4::simd_from(<[f64; 4]>::try_from(&reals_s1[0..4]).unwrap(), simd); - let in0_im = f64x4::simd_from(<[f64; 4]>::try_from(&imags_s0[0..4]).unwrap(), simd); - let in1_im = f64x4::simd_from(<[f64; 4]>::try_from(&imags_s1[0..4]).unwrap(), simd); + let in0_re = f64x4::simd_from(simd, <[f64; 4]>::try_from(&reals_s0[0..4]).unwrap()); + let in1_re = f64x4::simd_from(simd, <[f64; 4]>::try_from(&reals_s1[0..4]).unwrap()); + let in0_im = f64x4::simd_from(simd, <[f64; 4]>::try_from(&imags_s0[0..4]).unwrap()); + let in1_im = f64x4::simd_from(simd, <[f64; 4]>::try_from(&imags_s1[0..4]).unwrap()); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re)); @@ -164,18 +164,18 @@ pub fn fft_dit_chunk_8_simd_f32(simd: S, reals: &mut [f32], imags: &mut const CHUNK_SIZE: usize = DIST << 1; let two = f32x4::splat(simd, 2.0); - let sqrt2_2 = f32x4::simd_from([ + let sqrt2_2 = f32x4::simd_from(simd, [ 1.0_f32, // W_8^0 real std::f32::consts::FRAC_1_SQRT_2, // W_8^1 real (sqrt(2)/2) 0.0_f32, // W_8^2 real -std::f32::consts::FRAC_1_SQRT_2, // W_8^3 real (-sqrt(2)/2) - ], simd); - let sqrt2_2_im = f32x4::simd_from([ + ]); + let sqrt2_2_im = f32x4::simd_from(simd, [ 0.0_f32, // W_8^0 imag -std::f32::consts::FRAC_1_SQRT_2, // W_8^1 imag (-sqrt(2)/2) -1.0_f32, // W_8^2 imag -std::f32::consts::FRAC_1_SQRT_2, // W_8^3 imag (-sqrt(2)/2) - ], simd); + ]); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -183,10 +183,10 @@ pub fn fft_dit_chunk_8_simd_f32(simd: S, reals: &mut [f32], imags: &mut let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); - let in0_re = f32x4::simd_from(<[f32; 4]>::try_from(&reals_s0[0..4]).unwrap(), simd); - let in1_re = f32x4::simd_from(<[f32; 4]>::try_from(&reals_s1[0..4]).unwrap(), simd); - let in0_im = f32x4::simd_from(<[f32; 4]>::try_from(&imags_s0[0..4]).unwrap(), simd); - let in1_im = f32x4::simd_from(<[f32; 4]>::try_from(&imags_s1[0..4]).unwrap(), simd); + let in0_re = f32x4::simd_from(simd, <[f32; 4]>::try_from(&reals_s0[0..4]).unwrap()); + let in1_re = f32x4::simd_from(simd, <[f32; 4]>::try_from(&reals_s1[0..4]).unwrap()); + let in0_im = f32x4::simd_from(simd, <[f32; 4]>::try_from(&imags_s0[0..4]).unwrap()); + let in1_im = f32x4::simd_from(simd, <[f32; 4]>::try_from(&imags_s1[0..4]).unwrap()); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re)); @@ -213,7 +213,7 @@ pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mu let two = f64x8::splat(simd, 2.0); // Twiddle factors for W_16^k where k = 0..7 - let twiddle_re = f64x8::simd_from([ + let twiddle_re = f64x8::simd_from(simd, [ 1.0, // W_16^0 0.9238795325112867, // W_16^1 = cos(pi/8) std::f64::consts::FRAC_1_SQRT_2, // W_16^2 = sqrt(2)/2 @@ -222,9 +222,9 @@ pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.38268343236508984, // W_16^5 = -cos(3*pi/8) -std::f64::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 -0.9238795325112867, // W_16^7 = -cos(pi/8) - ], simd); + ]); - let twiddle_im = f64x8::simd_from([ + let twiddle_im = f64x8::simd_from(simd, [ 0.0, // W_16^0 -0.38268343236508984, // W_16^1 = -sin(pi/8) -std::f64::consts::FRAC_1_SQRT_2, // W_16^2 = -sqrt(2)/2 @@ -233,7 +233,7 @@ pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.9238795325112867, // W_16^5 = -sin(3*pi/8) -std::f64::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 -0.38268343236508984, // W_16^7 = -sin(pi/8) - ], simd); + ]); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -242,10 +242,10 @@ pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Load all 8 elements at once - let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[0..8]).unwrap(), simd); - let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[0..8]).unwrap(), simd); - let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[0..8]).unwrap(), simd); - let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[0..8]).unwrap(), simd); + let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[0..8]).unwrap()); + let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[0..8]).unwrap()); + let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[0..8]).unwrap()); + let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[0..8]).unwrap()); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -270,7 +270,7 @@ pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mu let two = f32x8::splat(simd, 2.0); // Twiddle factors for W_16^k where k = 0..7 - let twiddle_re = f32x8::simd_from([ + let twiddle_re = f32x8::simd_from(simd, [ 1.0_f32, // W_16^0 0.923_879_5_f32, // W_16^1 = cos(pi/8) std::f32::consts::FRAC_1_SQRT_2, // W_16^2 = sqrt(2)/2 @@ -279,9 +279,9 @@ pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mu -0.382_683_43_f32, // W_16^5 = -cos(3*pi/8) -std::f32::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 -0.923_879_5_f32, // W_16^7 = -cos(pi/8) - ], simd); + ]); - let twiddle_im = f32x8::simd_from([ + let twiddle_im = f32x8::simd_from(simd, [ 0.0_f32, // W_16^0 -0.382_683_43_f32, // W_16^1 = -sin(pi/8) -std::f32::consts::FRAC_1_SQRT_2, // W_16^2 = -sqrt(2)/2 @@ -290,7 +290,7 @@ pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mu -0.923_879_5_f32, // W_16^5 = -sin(3*pi/8) -std::f32::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 -0.382_683_43_f32, // W_16^7 = -sin(pi/8) - ], simd); + ]); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -299,10 +299,10 @@ pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Load all 8 elements at once - let in0_re = f32x8::simd_from(<[f32; 8]>::try_from(&reals_s0[0..8]).unwrap(), simd); - let in1_re = f32x8::simd_from(<[f32; 8]>::try_from(&reals_s1[0..8]).unwrap(), simd); - let in0_im = f32x8::simd_from(<[f32; 8]>::try_from(&imags_s0[0..8]).unwrap(), simd); - let in1_im = f32x8::simd_from(<[f32; 8]>::try_from(&imags_s1[0..8]).unwrap(), simd); + let in0_re = f32x8::simd_from(simd, <[f32; 8]>::try_from(&reals_s0[0..8]).unwrap()); + let in1_re = f32x8::simd_from(simd, <[f32; 8]>::try_from(&reals_s1[0..8]).unwrap()); + let in0_im = f32x8::simd_from(simd, <[f32; 8]>::try_from(&imags_s0[0..8]).unwrap()); + let in1_im = f32x8::simd_from(simd, <[f32; 8]>::try_from(&imags_s1[0..8]).unwrap()); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -326,7 +326,7 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu let two = f64x8::splat(simd, 2.0); // First 8 twiddle factors for W_32^k where k = 0..7 - let twiddle_re_0_7 = f64x8::simd_from([ + let twiddle_re_0_7 = f64x8::simd_from(simd, [ 1.0, // W_32^0 = 1 0.9807852804032304, // W_32^1 = cos(π/16) 0.9238795325112867, // W_32^2 = cos(π/8) @@ -335,9 +335,9 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu 0.5555702330196022, // W_32^5 = cos(5π/16) 0.3826834323650898, // W_32^6 = cos(3π/8) 0.19509032201612825, // W_32^7 = cos(7π/16) - ], simd); + ]); - let twiddle_im_0_7 = f64x8::simd_from([ + let twiddle_im_0_7 = f64x8::simd_from(simd, [ 0.0, // W_32^0 -0.19509032201612825, // W_32^1 = -sin(π/16) -0.3826834323650898, // W_32^2 = -sin(π/8) @@ -346,10 +346,10 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.8314696123025452, // W_32^5 = -sin(5π/16) -0.9238795325112867, // W_32^6 = -sin(3π/8) -0.9807852804032304, // W_32^7 = -sin(7π/16) - ], simd); + ]); // Second 8 twiddle factors for W_32^k where k = 8..15 - let twiddle_re_8_15 = f64x8::simd_from([ + let twiddle_re_8_15 = f64x8::simd_from(simd, [ 0.0, // W_32^8 = 0 - i -0.19509032201612825, // W_32^9 -0.3826834323650898, // W_32^10 @@ -358,9 +358,9 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.8314696123025452, // W_32^13 -0.9238795325112867, // W_32^14 -0.9807852804032304, // W_32^15 - ], simd); + ]); - let twiddle_im_8_15 = f64x8::simd_from([ + let twiddle_im_8_15 = f64x8::simd_from(simd, [ -1.0, // W_32^8 -0.9807852804032304, // W_32^9 -0.9238795325112867, // W_32^10 @@ -369,7 +369,7 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.5555702330196022, // W_32^13 -0.3826834323650898, // W_32^14 -0.19509032201612825, // W_32^15 - ], simd); + ]); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -378,10 +378,10 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process first 8 butterflies - let in0_re_0_7 = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[0..8]).unwrap(), simd); - let in1_re_0_7 = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[0..8]).unwrap(), simd); - let in0_im_0_7 = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[0..8]).unwrap(), simd); - let in1_im_0_7 = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[0..8]).unwrap(), simd); + let in0_re_0_7 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[0..8]).unwrap()); + let in1_re_0_7 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[0..8]).unwrap()); + let in0_im_0_7 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[0..8]).unwrap()); + let in1_im_0_7 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[0..8]).unwrap()); let out0_re_0_7 = twiddle_im_0_7.mul_add(-in1_im_0_7, twiddle_re_0_7.mul_add(in1_re_0_7, in0_re_0_7)); @@ -397,10 +397,10 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[0..8].copy_from_slice(out1_im_0_7.as_slice()); // Process second 8 butterflies - let in0_re_8_15 = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[8..16]).unwrap(), simd); - let in1_re_8_15 = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[8..16]).unwrap(), simd); - let in0_im_8_15 = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[8..16]).unwrap(), simd); - let in1_im_8_15 = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[8..16]).unwrap(), simd); + let in0_re_8_15 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[8..16]).unwrap()); + let in1_re_8_15 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[8..16]).unwrap()); + let in0_im_8_15 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[8..16]).unwrap()); + let in1_im_8_15 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[8..16]).unwrap()); let out0_re_8_15 = twiddle_im_8_15.mul_add( -in1_im_8_15, @@ -430,7 +430,7 @@ pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mu let two = f32x16::splat(simd, 2.0); // All 16 twiddle factors for W_32^k where k = 0..15 - let twiddle_re = f32x16::simd_from([ + let twiddle_re = f32x16::simd_from(simd, [ 1.0_f32, // W_32^0 = 1 0.980_785_25_f32, // W_32^1 = cos(π/16) 0.923_879_5_f32, // W_32^2 = cos(π/8) @@ -447,9 +447,9 @@ pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mu -0.831_469_6_f32, // W_32^13 -0.923_879_5_f32, // W_32^14 -0.980_785_25_f32, // W_32^15 - ], simd); + ]); - let twiddle_im = f32x16::simd_from([ + let twiddle_im = f32x16::simd_from(simd, [ 0.0_f32, // W_32^0 -0.195_090_32_f32, // W_32^1 = -sin(π/16) -0.382_683_43_f32, // W_32^2 = -sin(π/8) @@ -466,7 +466,7 @@ pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mu -0.555_570_24_f32, // W_32^13 -0.382_683_43_f32, // W_32^14 -0.195_090_32_f32, // W_32^15 - ], simd); + ]); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -475,10 +475,10 @@ pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process all 16 butterflies at once with f32x16 - let in0_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s0[0..16]).unwrap(), simd); - let in1_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s1[0..16]).unwrap(), simd); - let in0_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s0[0..16]).unwrap(), simd); - let in1_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s1[0..16]).unwrap(), simd); + let in0_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s0[0..16]).unwrap()); + let in1_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s1[0..16]).unwrap()); + let in0_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s0[0..16]).unwrap()); + let in1_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s1[0..16]).unwrap()); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -503,7 +503,7 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu // Process in 4 iterations of 8 butterflies each // Twiddles for W_64^k where k = 0..7 - let twiddle_re_0_7 = f64x8::simd_from([ + let twiddle_re_0_7 = f64x8::simd_from(simd, [ 1.0, // W_64^0 = 1 0.9951847266721969, // W_64^1 = cos(π/32) 0.9807852804032304, // W_64^2 = cos(π/16) @@ -512,9 +512,9 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu 0.8819212643483549, // W_64^5 = cos(5π/32) 0.8314696123025452, // W_64^6 = cos(3π/16) 0.773010453362737, // W_64^7 = cos(7π/32) - ], simd); + ]); - let twiddle_im_0_7 = f64x8::simd_from([ + let twiddle_im_0_7 = f64x8::simd_from(simd, [ 0.0, // W_64^0 -0.0980171403295606, // W_64^1 = -sin(π/32) -0.19509032201612825, // W_64^2 = -sin(π/16) @@ -523,10 +523,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.47139673682599764, // W_64^5 = -sin(5π/32) -0.5555702330196022, // W_64^6 = -sin(3π/16) -0.6343932841636455, // W_64^7 = -sin(7π/32) - ], simd); + ]); // Twiddles for k = 8..15 - let twiddle_re_8_15 = f64x8::simd_from([ + let twiddle_re_8_15 = f64x8::simd_from(simd, [ std::f64::consts::FRAC_1_SQRT_2, // W_64^8 = sqrt(2)/2 0.6343932841636455, // W_64^9 0.5555702330196022, // W_64^10 @@ -535,9 +535,9 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu 0.29028467725446233, // W_64^13 0.19509032201612825, // W_64^14 0.0980171403295606, // W_64^15 - ], simd); + ]); - let twiddle_im_8_15 = f64x8::simd_from([ + let twiddle_im_8_15 = f64x8::simd_from(simd, [ -std::f64::consts::FRAC_1_SQRT_2, // W_64^8 -0.773010453362737, // W_64^9 -0.8314696123025452, // W_64^10 @@ -546,10 +546,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.9569403357322089, // W_64^13 -0.9807852804032304, // W_64^14 -0.9951847266721969, // W_64^15 - ], simd); + ]); // Twiddles for k = 16..23 - let twiddle_re_16_23 = f64x8::simd_from([ + let twiddle_re_16_23 = f64x8::simd_from(simd, [ 0.0, // W_64^16 = -i -0.0980171403295606, // W_64^17 -0.19509032201612825, // W_64^18 @@ -558,9 +558,9 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.47139673682599764, // W_64^21 -0.5555702330196022, // W_64^22 -0.6343932841636455, // W_64^23 - ], simd); + ]); - let twiddle_im_16_23 = f64x8::simd_from([ + let twiddle_im_16_23 = f64x8::simd_from(simd, [ -1.0, // W_64^16 -0.9951847266721969, // W_64^17 -0.9807852804032304, // W_64^18 @@ -569,10 +569,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.8819212643483549, // W_64^21 -0.8314696123025452, // W_64^22 -0.773010453362737, // W_64^23 - ], simd); + ]); // Twiddles for k = 24..31 - let twiddle_re_24_31 = f64x8::simd_from([ + let twiddle_re_24_31 = f64x8::simd_from(simd, [ -std::f64::consts::FRAC_1_SQRT_2, // W_64^24 -0.773010453362737, // W_64^25 -0.8314696123025452, // W_64^26 @@ -581,9 +581,9 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.9569403357322089, // W_64^29 -0.9807852804032304, // W_64^30 -0.9951847266721969, // W_64^31 - ], simd); + ]); - let twiddle_im_24_31 = f64x8::simd_from([ + let twiddle_im_24_31 = f64x8::simd_from(simd, [ -std::f64::consts::FRAC_1_SQRT_2, // W_64^24 -0.6343932841636455, // W_64^25 -0.5555702330196022, // W_64^26 @@ -592,7 +592,7 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu -0.29028467725446233, // W_64^29 -0.19509032201612825, // W_64^30 -0.0980171403295606, // W_64^31 - ], simd); + ]); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -601,10 +601,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process butterflies 0..7 - let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[0..8]).unwrap(), simd); - let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[0..8]).unwrap(), simd); - let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[0..8]).unwrap(), simd); - let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[0..8]).unwrap(), simd); + let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[0..8]).unwrap()); + let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[0..8]).unwrap()); + let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[0..8]).unwrap()); + let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[0..8]).unwrap()); let out0_re = twiddle_im_0_7.mul_add(-in1_im, twiddle_re_0_7.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_0_7.mul_add(in1_re, twiddle_re_0_7.mul_add(in1_im, in0_im)); @@ -617,10 +617,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[0..8].copy_from_slice(out1_im.as_slice()); // Process butterflies 8..15 - let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[8..16]).unwrap(), simd); - let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[8..16]).unwrap(), simd); - let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[8..16]).unwrap(), simd); - let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[8..16]).unwrap(), simd); + let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[8..16]).unwrap()); + let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[8..16]).unwrap()); + let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[8..16]).unwrap()); + let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[8..16]).unwrap()); let out0_re = twiddle_im_8_15.mul_add(-in1_im, twiddle_re_8_15.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_8_15.mul_add(in1_re, twiddle_re_8_15.mul_add(in1_im, in0_im)); @@ -633,10 +633,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[8..16].copy_from_slice(out1_im.as_slice()); // Process butterflies 16..23 - let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[16..24]).unwrap(), simd); - let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[16..24]).unwrap(), simd); - let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[16..24]).unwrap(), simd); - let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[16..24]).unwrap(), simd); + let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[16..24]).unwrap()); + let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[16..24]).unwrap()); + let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[16..24]).unwrap()); + let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[16..24]).unwrap()); let out0_re = twiddle_im_16_23.mul_add(-in1_im, twiddle_re_16_23.mul_add(in1_re, in0_re)); @@ -651,10 +651,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[16..24].copy_from_slice(out1_im.as_slice()); // Process butterflies 24..31 - let in0_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s0[24..32]).unwrap(), simd); - let in1_re = f64x8::simd_from(<[f64; 8]>::try_from(&reals_s1[24..32]).unwrap(), simd); - let in0_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s0[24..32]).unwrap(), simd); - let in1_im = f64x8::simd_from(<[f64; 8]>::try_from(&imags_s1[24..32]).unwrap(), simd); + let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[24..32]).unwrap()); + let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[24..32]).unwrap()); + let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[24..32]).unwrap()); + let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[24..32]).unwrap()); let out0_re = twiddle_im_24_31.mul_add(-in1_im, twiddle_re_24_31.mul_add(in1_re, in0_re)); @@ -680,7 +680,7 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu // Process in 2 iterations of 16 butterflies each // Twiddles for W_64^k where k = 0..15 - let twiddle_re_0_15 = f32x16::simd_from([ + let twiddle_re_0_15 = f32x16::simd_from(simd, [ 1.0_f32, // W_64^0 = 1 0.995_184_7_f32, // W_64^1 = cos(π/32) 0.980_785_25_f32, // W_64^2 = cos(π/16) @@ -697,9 +697,9 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu 0.290_284_66_f32, // W_64^13 0.195_090_32_f32, // W_64^14 0.098_017_14_f32, // W_64^15 - ], simd); + ]); - let twiddle_im_0_15 = f32x16::simd_from([ + let twiddle_im_0_15 = f32x16::simd_from(simd, [ 0.0_f32, // W_64^0 -0.098_017_14_f32, // W_64^1 = -sin(π/32) -0.195_090_32_f32, // W_64^2 = -sin(π/16) @@ -716,10 +716,10 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu -0.956_940_35_f32, // W_64^13 -0.980_785_25_f32, // W_64^14 -0.995_184_7_f32, // W_64^15 - ], simd); + ]); // Twiddles for k = 16..31 - let twiddle_re_16_31 = f32x16::simd_from([ + let twiddle_re_16_31 = f32x16::simd_from(simd, [ 0.0_f32, // W_64^16 = -i -0.098_017_14_f32, // W_64^17 -0.195_090_32_f32, // W_64^18 @@ -736,9 +736,9 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu -0.956_940_35_f32, // W_64^29 -0.980_785_25_f32, // W_64^30 -0.995_184_7_f32, // W_64^31 - ], simd); + ]); - let twiddle_im_16_31 = f32x16::simd_from([ + let twiddle_im_16_31 = f32x16::simd_from(simd, [ -1.0_f32, // W_64^16 -0.995_184_7_f32, // W_64^17 -0.980_785_25_f32, // W_64^18 @@ -755,7 +755,7 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu -0.290_284_66_f32, // W_64^29 -0.195_090_32_f32, // W_64^30 -0.098_017_14_f32, // W_64^31 - ], simd); + ]); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -764,10 +764,10 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process butterflies 0..15 - let in0_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s0[0..16]).unwrap(), simd); - let in1_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s1[0..16]).unwrap(), simd); - let in0_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s0[0..16]).unwrap(), simd); - let in1_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s1[0..16]).unwrap(), simd); + let in0_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s0[0..16]).unwrap()); + let in1_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s1[0..16]).unwrap()); + let in0_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s0[0..16]).unwrap()); + let in1_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s1[0..16]).unwrap()); let out0_re = twiddle_im_0_15.mul_add(-in1_im, twiddle_re_0_15.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_0_15.mul_add(in1_re, twiddle_re_0_15.mul_add(in1_im, in0_im)); @@ -780,10 +780,10 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu imags_s1[0..16].copy_from_slice(out1_im.as_slice()); // Process butterflies 16..31 - let in0_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s0[16..32]).unwrap(), simd); - let in1_re = f32x16::simd_from(<[f32; 16]>::try_from(&reals_s1[16..32]).unwrap(), simd); - let in0_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s0[16..32]).unwrap(), simd); - let in1_im = f32x16::simd_from(<[f32; 16]>::try_from(&imags_s1[16..32]).unwrap(), simd); + let in0_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s0[16..32]).unwrap()); + let in1_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s1[16..32]).unwrap()); + let in0_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s0[16..32]).unwrap()); + let in1_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s1[16..32]).unwrap()); let out0_re = twiddle_im_16_31.mul_add(-in1_im, twiddle_re_16_31.mul_add(in1_re, in0_re)); @@ -828,13 +828,13 @@ pub fn fft_dit_64_chunk_n_simd( .zip(twiddles_im.as_chunks::().0.iter()) .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { let two = f64x8::splat(simd, 2.0); - let in0_re = f64x8::simd_from(*re_s0, simd); - let in1_re = f64x8::simd_from(*re_s1, simd); - let in0_im = f64x8::simd_from(*im_s0, simd); - let in1_im = f64x8::simd_from(*im_s1, simd); + let in0_re = f64x8::simd_from(simd, *re_s0); + let in1_re = f64x8::simd_from(simd, *re_s1); + let in0_im = f64x8::simd_from(simd, *im_s0); + let in1_im = f64x8::simd_from(simd, *im_s1); - let tw_re = f64x8::simd_from(*tw_re, simd); - let tw_im = f64x8::simd_from(*tw_im, simd); + let tw_re = f64x8::simd_from(simd, *tw_re); + let tw_im = f64x8::simd_from(simd, *tw_im); // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re)); @@ -882,13 +882,13 @@ pub fn fft_dit_32_chunk_n_simd( .zip(twiddles_im.as_chunks::().0.iter()) .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { let two = f32x16::splat(simd, 2.0); - let in0_re = f32x16::simd_from(*re_s0, simd); - let in1_re = f32x16::simd_from(*re_s1, simd); - let in0_im = f32x16::simd_from(*im_s0, simd); - let in1_im = f32x16::simd_from(*im_s1, simd); + let in0_re = f32x16::simd_from(simd, *re_s0); + let in1_re = f32x16::simd_from(simd, *re_s1); + let in0_im = f32x16::simd_from(simd, *im_s0); + let in1_im = f32x16::simd_from(simd, *im_s1); - let tw_re = f32x16::simd_from(*tw_re, simd); - let tw_im = f32x16::simd_from(*tw_im, simd); + let tw_re = f32x16::simd_from(simd, *tw_re); + let tw_im = f32x16::simd_from(simd, *tw_im); // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re)); From 6cbf4efac71d4270a199b529efd025958662c582 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 21 Jan 2026 12:08:56 +0000 Subject: [PATCH 10/12] cargo fmt --- src/kernels/dit.rs | 699 +++++++++++++++++++++++++-------------------- 1 file changed, 390 insertions(+), 309 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index bd36e05..17759e6 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -4,9 +4,8 @@ //! use core::f32; +use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8, Simd, SimdBase, SimdFloat, SimdFrom}; use num_traits::Float; -use fearless_simd::{Simd, SimdBase, SimdFrom, SimdFloat}; -use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; /// DIT butterfly for chunk_size == 2 /// Identical to DIF version (no twiddles at size 2) @@ -117,18 +116,24 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut const CHUNK_SIZE: usize = DIST << 1; let two = f64x4::splat(simd, 2.0); - let sqrt2_2 = f64x4::simd_from(simd, [ - 1.0, // W_8^0 real - std::f64::consts::FRAC_1_SQRT_2, // W_8^1 real (sqrt(2)/2) - 0.0, // W_8^2 real - -std::f64::consts::FRAC_1_SQRT_2, // W_8^3 real (-sqrt(2)/2) - ]); - let sqrt2_2_im = f64x4::simd_from(simd, [ - 0.0, // W_8^0 imag - -std::f64::consts::FRAC_1_SQRT_2, // W_8^1 imag (-sqrt(2)/2) - -1.0, // W_8^2 imag - -std::f64::consts::FRAC_1_SQRT_2, // W_8^3 imag (-sqrt(2)/2) - ]); + let sqrt2_2 = f64x4::simd_from( + simd, + [ + 1.0, // W_8^0 real + std::f64::consts::FRAC_1_SQRT_2, // W_8^1 real (sqrt(2)/2) + 0.0, // W_8^2 real + -std::f64::consts::FRAC_1_SQRT_2, // W_8^3 real (-sqrt(2)/2) + ], + ); + let sqrt2_2_im = f64x4::simd_from( + simd, + [ + 0.0, // W_8^0 imag + -std::f64::consts::FRAC_1_SQRT_2, // W_8^1 imag (-sqrt(2)/2) + -1.0, // W_8^2 imag + -std::f64::consts::FRAC_1_SQRT_2, // W_8^3 imag (-sqrt(2)/2) + ], + ); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -164,18 +169,24 @@ pub fn fft_dit_chunk_8_simd_f32(simd: S, reals: &mut [f32], imags: &mut const CHUNK_SIZE: usize = DIST << 1; let two = f32x4::splat(simd, 2.0); - let sqrt2_2 = f32x4::simd_from(simd, [ - 1.0_f32, // W_8^0 real - std::f32::consts::FRAC_1_SQRT_2, // W_8^1 real (sqrt(2)/2) - 0.0_f32, // W_8^2 real - -std::f32::consts::FRAC_1_SQRT_2, // W_8^3 real (-sqrt(2)/2) - ]); - let sqrt2_2_im = f32x4::simd_from(simd, [ - 0.0_f32, // W_8^0 imag - -std::f32::consts::FRAC_1_SQRT_2, // W_8^1 imag (-sqrt(2)/2) - -1.0_f32, // W_8^2 imag - -std::f32::consts::FRAC_1_SQRT_2, // W_8^3 imag (-sqrt(2)/2) - ]); + let sqrt2_2 = f32x4::simd_from( + simd, + [ + 1.0_f32, // W_8^0 real + std::f32::consts::FRAC_1_SQRT_2, // W_8^1 real (sqrt(2)/2) + 0.0_f32, // W_8^2 real + -std::f32::consts::FRAC_1_SQRT_2, // W_8^3 real (-sqrt(2)/2) + ], + ); + let sqrt2_2_im = f32x4::simd_from( + simd, + [ + 0.0_f32, // W_8^0 imag + -std::f32::consts::FRAC_1_SQRT_2, // W_8^1 imag (-sqrt(2)/2) + -1.0_f32, // W_8^2 imag + -std::f32::consts::FRAC_1_SQRT_2, // W_8^3 imag (-sqrt(2)/2) + ], + ); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -213,27 +224,33 @@ pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mu let two = f64x8::splat(simd, 2.0); // Twiddle factors for W_16^k where k = 0..7 - let twiddle_re = f64x8::simd_from(simd, [ - 1.0, // W_16^0 - 0.9238795325112867, // W_16^1 = cos(pi/8) - std::f64::consts::FRAC_1_SQRT_2, // W_16^2 = sqrt(2)/2 - 0.38268343236508984, // W_16^3 = cos(3*pi/8) - 0.0, // W_16^4 - -0.38268343236508984, // W_16^5 = -cos(3*pi/8) - -std::f64::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 - -0.9238795325112867, // W_16^7 = -cos(pi/8) - ]); - - let twiddle_im = f64x8::simd_from(simd, [ - 0.0, // W_16^0 - -0.38268343236508984, // W_16^1 = -sin(pi/8) - -std::f64::consts::FRAC_1_SQRT_2, // W_16^2 = -sqrt(2)/2 - -0.9238795325112867, // W_16^3 = -sin(3*pi/8) - -1.0, // W_16^4 - -0.9238795325112867, // W_16^5 = -sin(3*pi/8) - -std::f64::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 - -0.38268343236508984, // W_16^7 = -sin(pi/8) - ]); + let twiddle_re = f64x8::simd_from( + simd, + [ + 1.0, // W_16^0 + 0.9238795325112867, // W_16^1 = cos(pi/8) + std::f64::consts::FRAC_1_SQRT_2, // W_16^2 = sqrt(2)/2 + 0.38268343236508984, // W_16^3 = cos(3*pi/8) + 0.0, // W_16^4 + -0.38268343236508984, // W_16^5 = -cos(3*pi/8) + -std::f64::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 + -0.9238795325112867, // W_16^7 = -cos(pi/8) + ], + ); + + let twiddle_im = f64x8::simd_from( + simd, + [ + 0.0, // W_16^0 + -0.38268343236508984, // W_16^1 = -sin(pi/8) + -std::f64::consts::FRAC_1_SQRT_2, // W_16^2 = -sqrt(2)/2 + -0.9238795325112867, // W_16^3 = -sin(3*pi/8) + -1.0, // W_16^4 + -0.9238795325112867, // W_16^5 = -sin(3*pi/8) + -std::f64::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 + -0.38268343236508984, // W_16^7 = -sin(pi/8) + ], + ); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -270,27 +287,33 @@ pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mu let two = f32x8::splat(simd, 2.0); // Twiddle factors for W_16^k where k = 0..7 - let twiddle_re = f32x8::simd_from(simd, [ - 1.0_f32, // W_16^0 - 0.923_879_5_f32, // W_16^1 = cos(pi/8) - std::f32::consts::FRAC_1_SQRT_2, // W_16^2 = sqrt(2)/2 - 0.382_683_43_f32, // W_16^3 = cos(3*pi/8) - 0.0_f32, // W_16^4 - -0.382_683_43_f32, // W_16^5 = -cos(3*pi/8) - -std::f32::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 - -0.923_879_5_f32, // W_16^7 = -cos(pi/8) - ]); - - let twiddle_im = f32x8::simd_from(simd, [ - 0.0_f32, // W_16^0 - -0.382_683_43_f32, // W_16^1 = -sin(pi/8) - -std::f32::consts::FRAC_1_SQRT_2, // W_16^2 = -sqrt(2)/2 - -0.923_879_5_f32, // W_16^3 = -sin(3*pi/8) - -1.0_f32, // W_16^4 - -0.923_879_5_f32, // W_16^5 = -sin(3*pi/8) - -std::f32::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 - -0.382_683_43_f32, // W_16^7 = -sin(pi/8) - ]); + let twiddle_re = f32x8::simd_from( + simd, + [ + 1.0_f32, // W_16^0 + 0.923_879_5_f32, // W_16^1 = cos(pi/8) + std::f32::consts::FRAC_1_SQRT_2, // W_16^2 = sqrt(2)/2 + 0.382_683_43_f32, // W_16^3 = cos(3*pi/8) + 0.0_f32, // W_16^4 + -0.382_683_43_f32, // W_16^5 = -cos(3*pi/8) + -std::f32::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 + -0.923_879_5_f32, // W_16^7 = -cos(pi/8) + ], + ); + + let twiddle_im = f32x8::simd_from( + simd, + [ + 0.0_f32, // W_16^0 + -0.382_683_43_f32, // W_16^1 = -sin(pi/8) + -std::f32::consts::FRAC_1_SQRT_2, // W_16^2 = -sqrt(2)/2 + -0.923_879_5_f32, // W_16^3 = -sin(3*pi/8) + -1.0_f32, // W_16^4 + -0.923_879_5_f32, // W_16^5 = -sin(3*pi/8) + -std::f32::consts::FRAC_1_SQRT_2, // W_16^6 = -sqrt(2)/2 + -0.382_683_43_f32, // W_16^7 = -sin(pi/8) + ], + ); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -326,50 +349,62 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu let two = f64x8::splat(simd, 2.0); // First 8 twiddle factors for W_32^k where k = 0..7 - let twiddle_re_0_7 = f64x8::simd_from(simd, [ - 1.0, // W_32^0 = 1 - 0.9807852804032304, // W_32^1 = cos(π/16) - 0.9238795325112867, // W_32^2 = cos(π/8) - 0.8314696123025452, // W_32^3 = cos(3π/16) - std::f64::consts::FRAC_1_SQRT_2, // W_32^4 = sqrt(2)/2 - 0.5555702330196022, // W_32^5 = cos(5π/16) - 0.3826834323650898, // W_32^6 = cos(3π/8) - 0.19509032201612825, // W_32^7 = cos(7π/16) - ]); - - let twiddle_im_0_7 = f64x8::simd_from(simd, [ - 0.0, // W_32^0 - -0.19509032201612825, // W_32^1 = -sin(π/16) - -0.3826834323650898, // W_32^2 = -sin(π/8) - -0.5555702330196022, // W_32^3 = -sin(3π/16) - -std::f64::consts::FRAC_1_SQRT_2, // W_32^4 = -sqrt(2)/2 - -0.8314696123025452, // W_32^5 = -sin(5π/16) - -0.9238795325112867, // W_32^6 = -sin(3π/8) - -0.9807852804032304, // W_32^7 = -sin(7π/16) - ]); + let twiddle_re_0_7 = f64x8::simd_from( + simd, + [ + 1.0, // W_32^0 = 1 + 0.9807852804032304, // W_32^1 = cos(π/16) + 0.9238795325112867, // W_32^2 = cos(π/8) + 0.8314696123025452, // W_32^3 = cos(3π/16) + std::f64::consts::FRAC_1_SQRT_2, // W_32^4 = sqrt(2)/2 + 0.5555702330196022, // W_32^5 = cos(5π/16) + 0.3826834323650898, // W_32^6 = cos(3π/8) + 0.19509032201612825, // W_32^7 = cos(7π/16) + ], + ); + + let twiddle_im_0_7 = f64x8::simd_from( + simd, + [ + 0.0, // W_32^0 + -0.19509032201612825, // W_32^1 = -sin(π/16) + -0.3826834323650898, // W_32^2 = -sin(π/8) + -0.5555702330196022, // W_32^3 = -sin(3π/16) + -std::f64::consts::FRAC_1_SQRT_2, // W_32^4 = -sqrt(2)/2 + -0.8314696123025452, // W_32^5 = -sin(5π/16) + -0.9238795325112867, // W_32^6 = -sin(3π/8) + -0.9807852804032304, // W_32^7 = -sin(7π/16) + ], + ); // Second 8 twiddle factors for W_32^k where k = 8..15 - let twiddle_re_8_15 = f64x8::simd_from(simd, [ - 0.0, // W_32^8 = 0 - i - -0.19509032201612825, // W_32^9 - -0.3826834323650898, // W_32^10 - -0.5555702330196022, // W_32^11 - -std::f64::consts::FRAC_1_SQRT_2, // W_32^12 - -0.8314696123025452, // W_32^13 - -0.9238795325112867, // W_32^14 - -0.9807852804032304, // W_32^15 - ]); - - let twiddle_im_8_15 = f64x8::simd_from(simd, [ - -1.0, // W_32^8 - -0.9807852804032304, // W_32^9 - -0.9238795325112867, // W_32^10 - -0.8314696123025452, // W_32^11 - -std::f64::consts::FRAC_1_SQRT_2, // W_32^12 - -0.5555702330196022, // W_32^13 - -0.3826834323650898, // W_32^14 - -0.19509032201612825, // W_32^15 - ]); + let twiddle_re_8_15 = f64x8::simd_from( + simd, + [ + 0.0, // W_32^8 = 0 - i + -0.19509032201612825, // W_32^9 + -0.3826834323650898, // W_32^10 + -0.5555702330196022, // W_32^11 + -std::f64::consts::FRAC_1_SQRT_2, // W_32^12 + -0.8314696123025452, // W_32^13 + -0.9238795325112867, // W_32^14 + -0.9807852804032304, // W_32^15 + ], + ); + + let twiddle_im_8_15 = f64x8::simd_from( + simd, + [ + -1.0, // W_32^8 + -0.9807852804032304, // W_32^9 + -0.9238795325112867, // W_32^10 + -0.8314696123025452, // W_32^11 + -std::f64::consts::FRAC_1_SQRT_2, // W_32^12 + -0.5555702330196022, // W_32^13 + -0.3826834323650898, // W_32^14 + -0.19509032201612825, // W_32^15 + ], + ); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -397,10 +432,14 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[0..8].copy_from_slice(out1_im_0_7.as_slice()); // Process second 8 butterflies - let in0_re_8_15 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[8..16]).unwrap()); - let in1_re_8_15 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[8..16]).unwrap()); - let in0_im_8_15 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[8..16]).unwrap()); - let in1_im_8_15 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[8..16]).unwrap()); + let in0_re_8_15 = + f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[8..16]).unwrap()); + let in1_re_8_15 = + f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[8..16]).unwrap()); + let in0_im_8_15 = + f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[8..16]).unwrap()); + let in1_im_8_15 = + f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[8..16]).unwrap()); let out0_re_8_15 = twiddle_im_8_15.mul_add( -in1_im_8_15, @@ -430,43 +469,49 @@ pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mu let two = f32x16::splat(simd, 2.0); // All 16 twiddle factors for W_32^k where k = 0..15 - let twiddle_re = f32x16::simd_from(simd, [ - 1.0_f32, // W_32^0 = 1 - 0.980_785_25_f32, // W_32^1 = cos(π/16) - 0.923_879_5_f32, // W_32^2 = cos(π/8) - 0.831_469_6_f32, // W_32^3 = cos(3π/16) - std::f32::consts::FRAC_1_SQRT_2, // W_32^4 = sqrt(2)/2 - 0.555_570_24_f32, // W_32^5 = cos(5π/16) - 0.382_683_43_f32, // W_32^6 = cos(3π/8) - 0.195_090_32_f32, // W_32^7 = cos(7π/16) - 0.0_f32, // W_32^8 = 0 - i - -0.195_090_32_f32, // W_32^9 - -0.382_683_43_f32, // W_32^10 - -0.555_570_24_f32, // W_32^11 - -f32::consts::FRAC_1_SQRT_2, // W_32^12 - -0.831_469_6_f32, // W_32^13 - -0.923_879_5_f32, // W_32^14 - -0.980_785_25_f32, // W_32^15 - ]); - - let twiddle_im = f32x16::simd_from(simd, [ - 0.0_f32, // W_32^0 - -0.195_090_32_f32, // W_32^1 = -sin(π/16) - -0.382_683_43_f32, // W_32^2 = -sin(π/8) - -0.555_570_24_f32, // W_32^3 = -sin(3π/16) - -std::f32::consts::FRAC_1_SQRT_2, // W_32^4 = -sqrt(2)/2 - -0.831_469_6_f32, // W_32^5 = -sin(5π/16) - -0.923_879_5_f32, // W_32^6 = -sin(3π/8) - -0.980_785_25_f32, // W_32^7 = -sin(7π/16) - -1.0_f32, // W_32^8 - -0.980_785_25_f32, // W_32^9 - -0.923_879_5_f32, // W_32^10 - -0.831_469_6_f32, // W_32^11 - -std::f32::consts::FRAC_1_SQRT_2, // W_32^12 - -0.555_570_24_f32, // W_32^13 - -0.382_683_43_f32, // W_32^14 - -0.195_090_32_f32, // W_32^15 - ]); + let twiddle_re = f32x16::simd_from( + simd, + [ + 1.0_f32, // W_32^0 = 1 + 0.980_785_25_f32, // W_32^1 = cos(π/16) + 0.923_879_5_f32, // W_32^2 = cos(π/8) + 0.831_469_6_f32, // W_32^3 = cos(3π/16) + std::f32::consts::FRAC_1_SQRT_2, // W_32^4 = sqrt(2)/2 + 0.555_570_24_f32, // W_32^5 = cos(5π/16) + 0.382_683_43_f32, // W_32^6 = cos(3π/8) + 0.195_090_32_f32, // W_32^7 = cos(7π/16) + 0.0_f32, // W_32^8 = 0 - i + -0.195_090_32_f32, // W_32^9 + -0.382_683_43_f32, // W_32^10 + -0.555_570_24_f32, // W_32^11 + -f32::consts::FRAC_1_SQRT_2, // W_32^12 + -0.831_469_6_f32, // W_32^13 + -0.923_879_5_f32, // W_32^14 + -0.980_785_25_f32, // W_32^15 + ], + ); + + let twiddle_im = f32x16::simd_from( + simd, + [ + 0.0_f32, // W_32^0 + -0.195_090_32_f32, // W_32^1 = -sin(π/16) + -0.382_683_43_f32, // W_32^2 = -sin(π/8) + -0.555_570_24_f32, // W_32^3 = -sin(3π/16) + -std::f32::consts::FRAC_1_SQRT_2, // W_32^4 = -sqrt(2)/2 + -0.831_469_6_f32, // W_32^5 = -sin(5π/16) + -0.923_879_5_f32, // W_32^6 = -sin(3π/8) + -0.980_785_25_f32, // W_32^7 = -sin(7π/16) + -1.0_f32, // W_32^8 + -0.980_785_25_f32, // W_32^9 + -0.923_879_5_f32, // W_32^10 + -0.831_469_6_f32, // W_32^11 + -std::f32::consts::FRAC_1_SQRT_2, // W_32^12 + -0.555_570_24_f32, // W_32^13 + -0.382_683_43_f32, // W_32^14 + -0.195_090_32_f32, // W_32^15 + ], + ); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -503,96 +548,120 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu // Process in 4 iterations of 8 butterflies each // Twiddles for W_64^k where k = 0..7 - let twiddle_re_0_7 = f64x8::simd_from(simd, [ - 1.0, // W_64^0 = 1 - 0.9951847266721969, // W_64^1 = cos(π/32) - 0.9807852804032304, // W_64^2 = cos(π/16) - 0.9569403357322089, // W_64^3 = cos(3π/32) - 0.9238795325112867, // W_64^4 = cos(π/8) - 0.8819212643483549, // W_64^5 = cos(5π/32) - 0.8314696123025452, // W_64^6 = cos(3π/16) - 0.773010453362737, // W_64^7 = cos(7π/32) - ]); - - let twiddle_im_0_7 = f64x8::simd_from(simd, [ - 0.0, // W_64^0 - -0.0980171403295606, // W_64^1 = -sin(π/32) - -0.19509032201612825, // W_64^2 = -sin(π/16) - -0.29028467725446233, // W_64^3 = -sin(3π/32) - -0.3826834323650898, // W_64^4 = -sin(π/8) - -0.47139673682599764, // W_64^5 = -sin(5π/32) - -0.5555702330196022, // W_64^6 = -sin(3π/16) - -0.6343932841636455, // W_64^7 = -sin(7π/32) - ]); + let twiddle_re_0_7 = f64x8::simd_from( + simd, + [ + 1.0, // W_64^0 = 1 + 0.9951847266721969, // W_64^1 = cos(π/32) + 0.9807852804032304, // W_64^2 = cos(π/16) + 0.9569403357322089, // W_64^3 = cos(3π/32) + 0.9238795325112867, // W_64^4 = cos(π/8) + 0.8819212643483549, // W_64^5 = cos(5π/32) + 0.8314696123025452, // W_64^6 = cos(3π/16) + 0.773010453362737, // W_64^7 = cos(7π/32) + ], + ); + + let twiddle_im_0_7 = f64x8::simd_from( + simd, + [ + 0.0, // W_64^0 + -0.0980171403295606, // W_64^1 = -sin(π/32) + -0.19509032201612825, // W_64^2 = -sin(π/16) + -0.29028467725446233, // W_64^3 = -sin(3π/32) + -0.3826834323650898, // W_64^4 = -sin(π/8) + -0.47139673682599764, // W_64^5 = -sin(5π/32) + -0.5555702330196022, // W_64^6 = -sin(3π/16) + -0.6343932841636455, // W_64^7 = -sin(7π/32) + ], + ); // Twiddles for k = 8..15 - let twiddle_re_8_15 = f64x8::simd_from(simd, [ - std::f64::consts::FRAC_1_SQRT_2, // W_64^8 = sqrt(2)/2 - 0.6343932841636455, // W_64^9 - 0.5555702330196022, // W_64^10 - 0.47139673682599764, // W_64^11 - 0.3826834323650898, // W_64^12 - 0.29028467725446233, // W_64^13 - 0.19509032201612825, // W_64^14 - 0.0980171403295606, // W_64^15 - ]); - - let twiddle_im_8_15 = f64x8::simd_from(simd, [ - -std::f64::consts::FRAC_1_SQRT_2, // W_64^8 - -0.773010453362737, // W_64^9 - -0.8314696123025452, // W_64^10 - -0.8819212643483549, // W_64^11 - -0.9238795325112867, // W_64^12 - -0.9569403357322089, // W_64^13 - -0.9807852804032304, // W_64^14 - -0.9951847266721969, // W_64^15 - ]); + let twiddle_re_8_15 = f64x8::simd_from( + simd, + [ + std::f64::consts::FRAC_1_SQRT_2, // W_64^8 = sqrt(2)/2 + 0.6343932841636455, // W_64^9 + 0.5555702330196022, // W_64^10 + 0.47139673682599764, // W_64^11 + 0.3826834323650898, // W_64^12 + 0.29028467725446233, // W_64^13 + 0.19509032201612825, // W_64^14 + 0.0980171403295606, // W_64^15 + ], + ); + + let twiddle_im_8_15 = f64x8::simd_from( + simd, + [ + -std::f64::consts::FRAC_1_SQRT_2, // W_64^8 + -0.773010453362737, // W_64^9 + -0.8314696123025452, // W_64^10 + -0.8819212643483549, // W_64^11 + -0.9238795325112867, // W_64^12 + -0.9569403357322089, // W_64^13 + -0.9807852804032304, // W_64^14 + -0.9951847266721969, // W_64^15 + ], + ); // Twiddles for k = 16..23 - let twiddle_re_16_23 = f64x8::simd_from(simd, [ - 0.0, // W_64^16 = -i - -0.0980171403295606, // W_64^17 - -0.19509032201612825, // W_64^18 - -0.29028467725446233, // W_64^19 - -0.3826834323650898, // W_64^20 - -0.47139673682599764, // W_64^21 - -0.5555702330196022, // W_64^22 - -0.6343932841636455, // W_64^23 - ]); - - let twiddle_im_16_23 = f64x8::simd_from(simd, [ - -1.0, // W_64^16 - -0.9951847266721969, // W_64^17 - -0.9807852804032304, // W_64^18 - -0.9569403357322089, // W_64^19 - -0.9238795325112867, // W_64^20 - -0.8819212643483549, // W_64^21 - -0.8314696123025452, // W_64^22 - -0.773010453362737, // W_64^23 - ]); + let twiddle_re_16_23 = f64x8::simd_from( + simd, + [ + 0.0, // W_64^16 = -i + -0.0980171403295606, // W_64^17 + -0.19509032201612825, // W_64^18 + -0.29028467725446233, // W_64^19 + -0.3826834323650898, // W_64^20 + -0.47139673682599764, // W_64^21 + -0.5555702330196022, // W_64^22 + -0.6343932841636455, // W_64^23 + ], + ); + + let twiddle_im_16_23 = f64x8::simd_from( + simd, + [ + -1.0, // W_64^16 + -0.9951847266721969, // W_64^17 + -0.9807852804032304, // W_64^18 + -0.9569403357322089, // W_64^19 + -0.9238795325112867, // W_64^20 + -0.8819212643483549, // W_64^21 + -0.8314696123025452, // W_64^22 + -0.773010453362737, // W_64^23 + ], + ); // Twiddles for k = 24..31 - let twiddle_re_24_31 = f64x8::simd_from(simd, [ - -std::f64::consts::FRAC_1_SQRT_2, // W_64^24 - -0.773010453362737, // W_64^25 - -0.8314696123025452, // W_64^26 - -0.8819212643483549, // W_64^27 - -0.9238795325112867, // W_64^28 - -0.9569403357322089, // W_64^29 - -0.9807852804032304, // W_64^30 - -0.9951847266721969, // W_64^31 - ]); - - let twiddle_im_24_31 = f64x8::simd_from(simd, [ - -std::f64::consts::FRAC_1_SQRT_2, // W_64^24 - -0.6343932841636455, // W_64^25 - -0.5555702330196022, // W_64^26 - -0.47139673682599764, // W_64^27 - -0.3826834323650898, // W_64^28 - -0.29028467725446233, // W_64^29 - -0.19509032201612825, // W_64^30 - -0.0980171403295606, // W_64^31 - ]); + let twiddle_re_24_31 = f64x8::simd_from( + simd, + [ + -std::f64::consts::FRAC_1_SQRT_2, // W_64^24 + -0.773010453362737, // W_64^25 + -0.8314696123025452, // W_64^26 + -0.8819212643483549, // W_64^27 + -0.9238795325112867, // W_64^28 + -0.9569403357322089, // W_64^29 + -0.9807852804032304, // W_64^30 + -0.9951847266721969, // W_64^31 + ], + ); + + let twiddle_im_24_31 = f64x8::simd_from( + simd, + [ + -std::f64::consts::FRAC_1_SQRT_2, // W_64^24 + -0.6343932841636455, // W_64^25 + -0.5555702330196022, // W_64^26 + -0.47139673682599764, // W_64^27 + -0.3826834323650898, // W_64^28 + -0.29028467725446233, // W_64^29 + -0.19509032201612825, // W_64^30 + -0.0980171403295606, // W_64^31 + ], + ); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) @@ -680,82 +749,94 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu // Process in 2 iterations of 16 butterflies each // Twiddles for W_64^k where k = 0..15 - let twiddle_re_0_15 = f32x16::simd_from(simd, [ - 1.0_f32, // W_64^0 = 1 - 0.995_184_7_f32, // W_64^1 = cos(π/32) - 0.980_785_25_f32, // W_64^2 = cos(π/16) - 0.956_940_35_f32, // W_64^3 = cos(3π/32) - 0.923_879_5_f32, // W_64^4 = cos(π/8) - 0.881_921_3_f32, // W_64^5 = cos(5π/32) - 0.831_469_6_f32, // W_64^6 = cos(3π/16) - 0.773_010_43_f32, // W_64^7 = cos(7π/32) - std::f32::consts::FRAC_1_SQRT_2, // W_64^8 = sqrt(2)/2 - 0.634_393_3_f32, // W_64^9 - 0.555_570_24_f32, // W_64^10 - 0.471_396_74_f32, // W_64^11 - 0.382_683_43_f32, // W_64^12 - 0.290_284_66_f32, // W_64^13 - 0.195_090_32_f32, // W_64^14 - 0.098_017_14_f32, // W_64^15 - ]); - - let twiddle_im_0_15 = f32x16::simd_from(simd, [ - 0.0_f32, // W_64^0 - -0.098_017_14_f32, // W_64^1 = -sin(π/32) - -0.195_090_32_f32, // W_64^2 = -sin(π/16) - -0.290_284_66_f32, // W_64^3 = -sin(3π/32) - -0.382_683_43_f32, // W_64^4 = -sin(π/8) - -0.471_396_74_f32, // W_64^5 = -sin(5π/32) - -0.555_570_24_f32, // W_64^6 = -sin(3π/16) - -0.634_393_3_f32, // W_64^7 = -sin(7π/32) - -std::f32::consts::FRAC_1_SQRT_2, // W_64^8 - -0.773_010_43_f32, // W_64^9 - -0.831_469_6_f32, // W_64^10 - -0.881_921_3_f32, // W_64^11 - -0.923_879_5_f32, // W_64^12 - -0.956_940_35_f32, // W_64^13 - -0.980_785_25_f32, // W_64^14 - -0.995_184_7_f32, // W_64^15 - ]); + let twiddle_re_0_15 = f32x16::simd_from( + simd, + [ + 1.0_f32, // W_64^0 = 1 + 0.995_184_7_f32, // W_64^1 = cos(π/32) + 0.980_785_25_f32, // W_64^2 = cos(π/16) + 0.956_940_35_f32, // W_64^3 = cos(3π/32) + 0.923_879_5_f32, // W_64^4 = cos(π/8) + 0.881_921_3_f32, // W_64^5 = cos(5π/32) + 0.831_469_6_f32, // W_64^6 = cos(3π/16) + 0.773_010_43_f32, // W_64^7 = cos(7π/32) + std::f32::consts::FRAC_1_SQRT_2, // W_64^8 = sqrt(2)/2 + 0.634_393_3_f32, // W_64^9 + 0.555_570_24_f32, // W_64^10 + 0.471_396_74_f32, // W_64^11 + 0.382_683_43_f32, // W_64^12 + 0.290_284_66_f32, // W_64^13 + 0.195_090_32_f32, // W_64^14 + 0.098_017_14_f32, // W_64^15 + ], + ); + + let twiddle_im_0_15 = f32x16::simd_from( + simd, + [ + 0.0_f32, // W_64^0 + -0.098_017_14_f32, // W_64^1 = -sin(π/32) + -0.195_090_32_f32, // W_64^2 = -sin(π/16) + -0.290_284_66_f32, // W_64^3 = -sin(3π/32) + -0.382_683_43_f32, // W_64^4 = -sin(π/8) + -0.471_396_74_f32, // W_64^5 = -sin(5π/32) + -0.555_570_24_f32, // W_64^6 = -sin(3π/16) + -0.634_393_3_f32, // W_64^7 = -sin(7π/32) + -std::f32::consts::FRAC_1_SQRT_2, // W_64^8 + -0.773_010_43_f32, // W_64^9 + -0.831_469_6_f32, // W_64^10 + -0.881_921_3_f32, // W_64^11 + -0.923_879_5_f32, // W_64^12 + -0.956_940_35_f32, // W_64^13 + -0.980_785_25_f32, // W_64^14 + -0.995_184_7_f32, // W_64^15 + ], + ); // Twiddles for k = 16..31 - let twiddle_re_16_31 = f32x16::simd_from(simd, [ - 0.0_f32, // W_64^16 = -i - -0.098_017_14_f32, // W_64^17 - -0.195_090_32_f32, // W_64^18 - -0.290_284_66_f32, // W_64^19 - -0.382_683_43_f32, // W_64^20 - -0.471_396_74_f32, // W_64^21 - -0.555_570_24_f32, // W_64^22 - -0.634_393_3_f32, // W_64^23 - -std::f32::consts::FRAC_1_SQRT_2, // W_64^24 - -0.773_010_43_f32, // W_64^25 - -0.831_469_6_f32, // W_64^26 - -0.881_921_3_f32, // W_64^27 - -0.923_879_5_f32, // W_64^28 - -0.956_940_35_f32, // W_64^29 - -0.980_785_25_f32, // W_64^30 - -0.995_184_7_f32, // W_64^31 - ]); - - let twiddle_im_16_31 = f32x16::simd_from(simd, [ - -1.0_f32, // W_64^16 - -0.995_184_7_f32, // W_64^17 - -0.980_785_25_f32, // W_64^18 - -0.956_940_35_f32, // W_64^19 - -0.923_879_5_f32, // W_64^20 - -0.881_921_3_f32, // W_64^21 - -0.831_469_6_f32, // W_64^22 - -0.773_010_43_f32, // W_64^23 - -std::f32::consts::FRAC_1_SQRT_2, // W_64^24 - -0.634_393_3_f32, // W_64^25 - -0.555_570_24_f32, // W_64^26 - -0.471_396_74_f32, // W_64^27 - -0.382_683_43_f32, // W_64^28 - -0.290_284_66_f32, // W_64^29 - -0.195_090_32_f32, // W_64^30 - -0.098_017_14_f32, // W_64^31 - ]); + let twiddle_re_16_31 = f32x16::simd_from( + simd, + [ + 0.0_f32, // W_64^16 = -i + -0.098_017_14_f32, // W_64^17 + -0.195_090_32_f32, // W_64^18 + -0.290_284_66_f32, // W_64^19 + -0.382_683_43_f32, // W_64^20 + -0.471_396_74_f32, // W_64^21 + -0.555_570_24_f32, // W_64^22 + -0.634_393_3_f32, // W_64^23 + -std::f32::consts::FRAC_1_SQRT_2, // W_64^24 + -0.773_010_43_f32, // W_64^25 + -0.831_469_6_f32, // W_64^26 + -0.881_921_3_f32, // W_64^27 + -0.923_879_5_f32, // W_64^28 + -0.956_940_35_f32, // W_64^29 + -0.980_785_25_f32, // W_64^30 + -0.995_184_7_f32, // W_64^31 + ], + ); + + let twiddle_im_16_31 = f32x16::simd_from( + simd, + [ + -1.0_f32, // W_64^16 + -0.995_184_7_f32, // W_64^17 + -0.980_785_25_f32, // W_64^18 + -0.956_940_35_f32, // W_64^19 + -0.923_879_5_f32, // W_64^20 + -0.881_921_3_f32, // W_64^21 + -0.831_469_6_f32, // W_64^22 + -0.773_010_43_f32, // W_64^23 + -std::f32::consts::FRAC_1_SQRT_2, // W_64^24 + -0.634_393_3_f32, // W_64^25 + -0.555_570_24_f32, // W_64^26 + -0.471_396_74_f32, // W_64^27 + -0.382_683_43_f32, // W_64^28 + -0.290_284_66_f32, // W_64^29 + -0.195_090_32_f32, // W_64^30 + -0.098_017_14_f32, // W_64^31 + ], + ); (reals.as_chunks_mut::().0.iter_mut()) .zip(imags.as_chunks_mut::().0.iter_mut()) From 3ff0b45b2caa9d362cd7caf3aa734d089231bd86 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 21 Jan 2026 17:32:17 +0000 Subject: [PATCH 11/12] Simpler SIMD loads --- src/kernels/dit.rs | 108 ++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 56 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 17759e6..953210c 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -141,10 +141,10 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); - let in0_re = f64x4::simd_from(simd, <[f64; 4]>::try_from(&reals_s0[0..4]).unwrap()); - let in1_re = f64x4::simd_from(simd, <[f64; 4]>::try_from(&reals_s1[0..4]).unwrap()); - let in0_im = f64x4::simd_from(simd, <[f64; 4]>::try_from(&imags_s0[0..4]).unwrap()); - let in1_im = f64x4::simd_from(simd, <[f64; 4]>::try_from(&imags_s1[0..4]).unwrap()); + let in0_re = f64x4::from_slice(simd, &reals_s0[0..4]); + let in1_re = f64x4::from_slice(simd, &reals_s1[0..4]); + let in0_im = f64x4::from_slice(simd, &imags_s0[0..4]); + let in1_im = f64x4::from_slice(simd, &imags_s1[0..4]); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re)); @@ -194,10 +194,10 @@ pub fn fft_dit_chunk_8_simd_f32(simd: S, reals: &mut [f32], imags: &mut let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); - let in0_re = f32x4::simd_from(simd, <[f32; 4]>::try_from(&reals_s0[0..4]).unwrap()); - let in1_re = f32x4::simd_from(simd, <[f32; 4]>::try_from(&reals_s1[0..4]).unwrap()); - let in0_im = f32x4::simd_from(simd, <[f32; 4]>::try_from(&imags_s0[0..4]).unwrap()); - let in1_im = f32x4::simd_from(simd, <[f32; 4]>::try_from(&imags_s1[0..4]).unwrap()); + let in0_re = f32x4::from_slice(simd, &reals_s0[0..4]); + let in1_re = f32x4::from_slice(simd, &reals_s1[0..4]); + let in0_im = f32x4::from_slice(simd, &imags_s0[0..4]); + let in1_im = f32x4::from_slice(simd, &imags_s1[0..4]); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re)); @@ -259,10 +259,10 @@ pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Load all 8 elements at once - let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[0..8]).unwrap()); - let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[0..8]).unwrap()); - let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[0..8]).unwrap()); - let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[0..8]).unwrap()); + let in0_re = f64x8::from_slice(simd, &reals_s0[0..8]); + let in1_re = f64x8::from_slice(simd, &reals_s1[0..8]); + let in0_im = f64x8::from_slice(simd, &imags_s0[0..8]); + let in1_im = f64x8::from_slice(simd, &imags_s1[0..8]); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -322,10 +322,10 @@ pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Load all 8 elements at once - let in0_re = f32x8::simd_from(simd, <[f32; 8]>::try_from(&reals_s0[0..8]).unwrap()); - let in1_re = f32x8::simd_from(simd, <[f32; 8]>::try_from(&reals_s1[0..8]).unwrap()); - let in0_im = f32x8::simd_from(simd, <[f32; 8]>::try_from(&imags_s0[0..8]).unwrap()); - let in1_im = f32x8::simd_from(simd, <[f32; 8]>::try_from(&imags_s1[0..8]).unwrap()); + let in0_re = f32x8::from_slice(simd, &reals_s0[0..8]); + let in1_re = f32x8::from_slice(simd, &reals_s1[0..8]); + let in0_im = f32x8::from_slice(simd, &imags_s0[0..8]); + let in1_im = f32x8::from_slice(simd, &imags_s1[0..8]); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -413,10 +413,10 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process first 8 butterflies - let in0_re_0_7 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[0..8]).unwrap()); - let in1_re_0_7 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[0..8]).unwrap()); - let in0_im_0_7 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[0..8]).unwrap()); - let in1_im_0_7 = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[0..8]).unwrap()); + let in0_re_0_7 = f64x8::from_slice(simd, &reals_s0[0..8]); + let in1_re_0_7 = f64x8::from_slice(simd, &reals_s1[0..8]); + let in0_im_0_7 = f64x8::from_slice(simd, &imags_s0[0..8]); + let in1_im_0_7 = f64x8::from_slice(simd, &imags_s1[0..8]); let out0_re_0_7 = twiddle_im_0_7.mul_add(-in1_im_0_7, twiddle_re_0_7.mul_add(in1_re_0_7, in0_re_0_7)); @@ -432,14 +432,10 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[0..8].copy_from_slice(out1_im_0_7.as_slice()); // Process second 8 butterflies - let in0_re_8_15 = - f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[8..16]).unwrap()); - let in1_re_8_15 = - f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[8..16]).unwrap()); - let in0_im_8_15 = - f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[8..16]).unwrap()); - let in1_im_8_15 = - f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[8..16]).unwrap()); + let in0_re_8_15 = f64x8::from_slice(simd, &reals_s0[8..16]); + let in1_re_8_15 = f64x8::from_slice(simd, &reals_s1[8..16]); + let in0_im_8_15 = f64x8::from_slice(simd, &imags_s0[8..16]); + let in1_im_8_15 = f64x8::from_slice(simd, &imags_s1[8..16]); let out0_re_8_15 = twiddle_im_8_15.mul_add( -in1_im_8_15, @@ -520,10 +516,10 @@ pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process all 16 butterflies at once with f32x16 - let in0_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s0[0..16]).unwrap()); - let in1_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s1[0..16]).unwrap()); - let in0_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s0[0..16]).unwrap()); - let in1_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s1[0..16]).unwrap()); + let in0_re = f32x16::from_slice(simd, &reals_s0[0..16]); + let in1_re = f32x16::from_slice(simd, &reals_s1[0..16]); + let in0_im = f32x16::from_slice(simd, &imags_s0[0..16]); + let in1_im = f32x16::from_slice(simd, &imags_s1[0..16]); let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); @@ -670,10 +666,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process butterflies 0..7 - let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[0..8]).unwrap()); - let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[0..8]).unwrap()); - let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[0..8]).unwrap()); - let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[0..8]).unwrap()); + let in0_re = f64x8::from_slice(simd, &reals_s0[0..8]); + let in1_re = f64x8::from_slice(simd, &reals_s1[0..8]); + let in0_im = f64x8::from_slice(simd, &imags_s0[0..8]); + let in1_im = f64x8::from_slice(simd, &imags_s1[0..8]); let out0_re = twiddle_im_0_7.mul_add(-in1_im, twiddle_re_0_7.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_0_7.mul_add(in1_re, twiddle_re_0_7.mul_add(in1_im, in0_im)); @@ -686,10 +682,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[0..8].copy_from_slice(out1_im.as_slice()); // Process butterflies 8..15 - let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[8..16]).unwrap()); - let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[8..16]).unwrap()); - let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[8..16]).unwrap()); - let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[8..16]).unwrap()); + let in0_re = f64x8::from_slice(simd, &reals_s0[8..16]); + let in1_re = f64x8::from_slice(simd, &reals_s1[8..16]); + let in0_im = f64x8::from_slice(simd, &imags_s0[8..16]); + let in1_im = f64x8::from_slice(simd, &imags_s1[8..16]); let out0_re = twiddle_im_8_15.mul_add(-in1_im, twiddle_re_8_15.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_8_15.mul_add(in1_re, twiddle_re_8_15.mul_add(in1_im, in0_im)); @@ -702,10 +698,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[8..16].copy_from_slice(out1_im.as_slice()); // Process butterflies 16..23 - let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[16..24]).unwrap()); - let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[16..24]).unwrap()); - let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[16..24]).unwrap()); - let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[16..24]).unwrap()); + let in0_re = f64x8::from_slice(simd, &reals_s0[16..24]); + let in1_re = f64x8::from_slice(simd, &reals_s1[16..24]); + let in0_im = f64x8::from_slice(simd, &imags_s0[16..24]); + let in1_im = f64x8::from_slice(simd, &imags_s1[16..24]); let out0_re = twiddle_im_16_23.mul_add(-in1_im, twiddle_re_16_23.mul_add(in1_re, in0_re)); @@ -720,10 +716,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu imags_s1[16..24].copy_from_slice(out1_im.as_slice()); // Process butterflies 24..31 - let in0_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s0[24..32]).unwrap()); - let in1_re = f64x8::simd_from(simd, <[f64; 8]>::try_from(&reals_s1[24..32]).unwrap()); - let in0_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s0[24..32]).unwrap()); - let in1_im = f64x8::simd_from(simd, <[f64; 8]>::try_from(&imags_s1[24..32]).unwrap()); + let in0_re = f64x8::from_slice(simd, &reals_s0[24..32]); + let in1_re = f64x8::from_slice(simd, &reals_s1[24..32]); + let in0_im = f64x8::from_slice(simd, &imags_s0[24..32]); + let in1_im = f64x8::from_slice(simd, &imags_s1[24..32]); let out0_re = twiddle_im_24_31.mul_add(-in1_im, twiddle_re_24_31.mul_add(in1_re, in0_re)); @@ -845,10 +841,10 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); // Process butterflies 0..15 - let in0_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s0[0..16]).unwrap()); - let in1_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s1[0..16]).unwrap()); - let in0_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s0[0..16]).unwrap()); - let in1_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s1[0..16]).unwrap()); + let in0_re = f32x16::from_slice(simd, &reals_s0[0..16]); + let in1_re = f32x16::from_slice(simd, &reals_s1[0..16]); + let in0_im = f32x16::from_slice(simd, &imags_s0[0..16]); + let in1_im = f32x16::from_slice(simd, &imags_s1[0..16]); let out0_re = twiddle_im_0_15.mul_add(-in1_im, twiddle_re_0_15.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_0_15.mul_add(in1_re, twiddle_re_0_15.mul_add(in1_im, in0_im)); @@ -861,10 +857,10 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu imags_s1[0..16].copy_from_slice(out1_im.as_slice()); // Process butterflies 16..31 - let in0_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s0[16..32]).unwrap()); - let in1_re = f32x16::simd_from(simd, <[f32; 16]>::try_from(&reals_s1[16..32]).unwrap()); - let in0_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s0[16..32]).unwrap()); - let in1_im = f32x16::simd_from(simd, <[f32; 16]>::try_from(&imags_s1[16..32]).unwrap()); + let in0_re = f32x16::from_slice(simd, &reals_s0[16..32]); + let in1_re = f32x16::from_slice(simd, &reals_s1[16..32]); + let in0_im = f32x16::from_slice(simd, &imags_s0[16..32]); + let in1_im = f32x16::from_slice(simd, &imags_s1[16..32]); let out0_re = twiddle_im_16_31.mul_add(-in1_im, twiddle_re_16_31.mul_add(in1_re, in0_re)); From c955c978d93c20986155cea97ca81432cc7522e6 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 21 Jan 2026 18:03:03 +0000 Subject: [PATCH 12/12] Simpler SIMD stores --- src/kernels/dit.rs | 120 ++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 953210c..7e988ce 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -155,10 +155,10 @@ pub fn fft_dit_chunk_8_simd_f64(simd: S, reals: &mut [f64], imags: &mut let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_slice()); - imags_s0.copy_from_slice(out0_im.as_slice()); - reals_s1.copy_from_slice(out1_re.as_slice()); - imags_s1.copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(reals_s0); + out0_im.store_slice(imags_s0); + out1_re.store_slice(reals_s1); + out1_im.store_slice(imags_s1); }); } @@ -208,10 +208,10 @@ pub fn fft_dit_chunk_8_simd_f32(simd: S, reals: &mut [f32], imags: &mut let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_slice()); - imags_s0.copy_from_slice(out0_im.as_slice()); - reals_s1.copy_from_slice(out1_re.as_slice()); - imags_s1.copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(reals_s0); + out0_im.store_slice(imags_s0); + out1_re.store_slice(reals_s1); + out1_im.store_slice(imags_s1); }); } @@ -271,10 +271,10 @@ pub fn fft_dit_chunk_16_simd_f64(simd: S, reals: &mut [f64], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_slice()); - imags_s0.copy_from_slice(out0_im.as_slice()); - reals_s1.copy_from_slice(out1_re.as_slice()); - imags_s1.copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(reals_s0); + out0_im.store_slice(imags_s0); + out1_re.store_slice(reals_s1); + out1_im.store_slice(imags_s1); }); } @@ -334,10 +334,10 @@ pub fn fft_dit_chunk_16_simd_f32(simd: S, reals: &mut [f32], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_slice()); - imags_s0.copy_from_slice(out0_im.as_slice()); - reals_s1.copy_from_slice(out1_re.as_slice()); - imags_s1.copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(reals_s0); + out0_im.store_slice(imags_s0); + out1_re.store_slice(reals_s1); + out1_im.store_slice(imags_s1); }); } /// DIT butterfly for chunk_size == 32 (f64) @@ -426,10 +426,10 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu let out1_re_0_7 = two.mul_sub(in0_re_0_7, out0_re_0_7); let out1_im_0_7 = two.mul_sub(in0_im_0_7, out0_im_0_7); - reals_s0[0..8].copy_from_slice(out0_re_0_7.as_slice()); - imags_s0[0..8].copy_from_slice(out0_im_0_7.as_slice()); - reals_s1[0..8].copy_from_slice(out1_re_0_7.as_slice()); - imags_s1[0..8].copy_from_slice(out1_im_0_7.as_slice()); + out0_re_0_7.store_slice(&mut reals_s0[0..8]); + out0_im_0_7.store_slice(&mut imags_s0[0..8]); + out1_re_0_7.store_slice(&mut reals_s1[0..8]); + out1_im_0_7.store_slice(&mut imags_s1[0..8]); // Process second 8 butterflies let in0_re_8_15 = f64x8::from_slice(simd, &reals_s0[8..16]); @@ -449,10 +449,10 @@ pub fn fft_dit_chunk_32_simd_f64(simd: S, reals: &mut [f64], imags: &mu let out1_re_8_15 = two.mul_sub(in0_re_8_15, out0_re_8_15); let out1_im_8_15 = two.mul_sub(in0_im_8_15, out0_im_8_15); - reals_s0[8..16].copy_from_slice(out0_re_8_15.as_slice()); - imags_s0[8..16].copy_from_slice(out0_im_8_15.as_slice()); - reals_s1[8..16].copy_from_slice(out1_re_8_15.as_slice()); - imags_s1[8..16].copy_from_slice(out1_im_8_15.as_slice()); + out0_re_8_15.store_slice(&mut reals_s0[8..16]); + out0_im_8_15.store_slice(&mut imags_s0[8..16]); + out1_re_8_15.store_slice(&mut reals_s1[8..16]); + out1_im_8_15.store_slice(&mut imags_s1[8..16]); }); } @@ -527,10 +527,10 @@ pub fn fft_dit_chunk_32_simd_f32(simd: S, reals: &mut [f32], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0.copy_from_slice(out0_re.as_slice()); - imags_s0.copy_from_slice(out0_im.as_slice()); - reals_s1.copy_from_slice(out1_re.as_slice()); - imags_s1.copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(reals_s0); + out0_im.store_slice(imags_s0); + out1_re.store_slice(reals_s1); + out1_im.store_slice(imags_s1); }); } @@ -676,10 +676,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[0..8].copy_from_slice(out0_re.as_slice()); - imags_s0[0..8].copy_from_slice(out0_im.as_slice()); - reals_s1[0..8].copy_from_slice(out1_re.as_slice()); - imags_s1[0..8].copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(&mut reals_s0[0..8]); + out0_im.store_slice(&mut imags_s0[0..8]); + out1_re.store_slice(&mut reals_s1[0..8]); + out1_im.store_slice(&mut imags_s1[0..8]); // Process butterflies 8..15 let in0_re = f64x8::from_slice(simd, &reals_s0[8..16]); @@ -692,10 +692,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[8..16].copy_from_slice(out0_re.as_slice()); - imags_s0[8..16].copy_from_slice(out0_im.as_slice()); - reals_s1[8..16].copy_from_slice(out1_re.as_slice()); - imags_s1[8..16].copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(&mut reals_s0[8..16]); + out0_im.store_slice(&mut imags_s0[8..16]); + out1_re.store_slice(&mut reals_s1[8..16]); + out1_im.store_slice(&mut imags_s1[8..16]); // Process butterflies 16..23 let in0_re = f64x8::from_slice(simd, &reals_s0[16..24]); @@ -710,10 +710,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[16..24].copy_from_slice(out0_re.as_slice()); - imags_s0[16..24].copy_from_slice(out0_im.as_slice()); - reals_s1[16..24].copy_from_slice(out1_re.as_slice()); - imags_s1[16..24].copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(&mut reals_s0[16..24]); + out0_im.store_slice(&mut imags_s0[16..24]); + out1_re.store_slice(&mut reals_s1[16..24]); + out1_im.store_slice(&mut imags_s1[16..24]); // Process butterflies 24..31 let in0_re = f64x8::from_slice(simd, &reals_s0[24..32]); @@ -728,10 +728,10 @@ pub fn fft_dit_chunk_64_simd_f64(simd: S, reals: &mut [f64], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[24..32].copy_from_slice(out0_re.as_slice()); - imags_s0[24..32].copy_from_slice(out0_im.as_slice()); - reals_s1[24..32].copy_from_slice(out1_re.as_slice()); - imags_s1[24..32].copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(&mut reals_s0[24..32]); + out0_im.store_slice(&mut imags_s0[24..32]); + out1_re.store_slice(&mut reals_s1[24..32]); + out1_im.store_slice(&mut imags_s1[24..32]); }); } @@ -851,10 +851,10 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[0..16].copy_from_slice(out0_re.as_slice()); - imags_s0[0..16].copy_from_slice(out0_im.as_slice()); - reals_s1[0..16].copy_from_slice(out1_re.as_slice()); - imags_s1[0..16].copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(&mut reals_s0[0..16]); + out0_im.store_slice(&mut imags_s0[0..16]); + out1_re.store_slice(&mut reals_s1[0..16]); + out1_im.store_slice(&mut imags_s1[0..16]); // Process butterflies 16..31 let in0_re = f32x16::from_slice(simd, &reals_s0[16..32]); @@ -869,10 +869,10 @@ pub fn fft_dit_chunk_64_simd_f32(simd: S, reals: &mut [f32], imags: &mu let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - reals_s0[16..32].copy_from_slice(out0_re.as_slice()); - imags_s0[16..32].copy_from_slice(out0_im.as_slice()); - reals_s1[16..32].copy_from_slice(out1_re.as_slice()); - imags_s1[16..32].copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(&mut reals_s0[16..32]); + out0_im.store_slice(&mut imags_s0[16..32]); + out1_re.store_slice(&mut reals_s1[16..32]); + out1_im.store_slice(&mut imags_s1[16..32]); }); } @@ -922,10 +922,10 @@ pub fn fft_dit_64_chunk_n_simd( let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - re_s0.copy_from_slice(out0_re.as_slice()); - im_s0.copy_from_slice(out0_im.as_slice()); - re_s1.copy_from_slice(out1_re.as_slice()); - im_s1.copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(re_s0); + out0_im.store_slice(im_s0); + out1_re.store_slice(re_s1); + out1_im.store_slice(im_s1); }); }); } @@ -976,10 +976,10 @@ pub fn fft_dit_32_chunk_n_simd( let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - re_s0.copy_from_slice(out0_re.as_slice()); - im_s0.copy_from_slice(out0_im.as_slice()); - re_s1.copy_from_slice(out1_re.as_slice()); - im_s1.copy_from_slice(out1_im.as_slice()); + out0_re.store_slice(re_s0); + out0_im.store_slice(im_s0); + out1_re.store_slice(re_s1); + out1_im.store_slice(im_s1); }); }); }