diff --git a/Cargo.toml b/Cargo.toml
index 142f4fd..c38915b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,6 +19,7 @@
 num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true }
 bytemuck = { version = "1.23.2", optional = true }
 wide = "1.0.3"
 rayon = { version = "1.11.0", optional = true }
+fearless_simd = { git = "https://github.com/valadaptive/fearless_simd.git", branch = "no-x4", version = "0.3.0" }
 [features]
 default = []
diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs
index 0353354..4e20c8b 100644
--- a/src/algorithms/dit.rs
+++ b/src/algorithms/dit.rs
@@ -14,14 +14,10 @@
 //! DIT starts with fine-grained memory access and progressively works with
 //! larger contiguous chunks.
 //!
+use fearless_simd::{dispatch, Level, Simd};
+
 use crate::algorithms::cobra::cobra_apply;
-use crate::kernels::dit::{
-    fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_chunk_16_simd_f32,
-    fft_dit_chunk_16_simd_f64, fft_dit_chunk_2, fft_dit_chunk_32_simd_f32,
-    fft_dit_chunk_32_simd_f64, fft_dit_chunk_4_simd_f32, fft_dit_chunk_4_simd_f64,
-    fft_dit_chunk_64_simd_f32, fft_dit_chunk_64_simd_f64, fft_dit_chunk_8_simd_f32,
-    fft_dit_chunk_8_simd_f64,
-};
+use crate::kernels::dit::*;
 use crate::options::Options;
 use crate::parallel::run_maybe_in_parallel;
 use crate::planner::{Direction, PlannerDit32, PlannerDit64};
@@ -33,7 +29,8 @@ const L1_BLOCK_SIZE: usize = 1024;
 ///
 /// Recursively divides by 2 until reaching L1 cache size, processes stages within
 /// each block, then processes cross-block stages on return.
-fn recursive_dit_fft_f64(
+fn recursive_dit_fft_f64<S: Simd>(
+    simd: S,
     reals: &mut [f64],
     imags: &mut [f64],
     size: usize,
@@ -46,6 +43,7 @@
     if size <= L1_BLOCK_SIZE {
         for stage in 0..log_size {
             stage_twiddle_idx = execute_dit_stage_f64(
+                simd,
                 &mut reals[..size],
                 &mut imags[..size],
                 stage,
@@ -63,8 +61,8 @@
     // Recursively process both halves
     run_maybe_in_parallel(
         size > opts.smallest_parallel_chunk_size,
-        || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0),
-        || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0),
+        || recursive_dit_fft_f64(simd, re_first_half, im_first_half, half, planner, opts, 0),
+        || recursive_dit_fft_f64(simd, re_second_half, im_second_half, half, planner, opts, 0),
     );
 
     // Both halves completed stages 0..log_half-1
@@ -74,6 +72,7 @@
     // Process remaining stages that span both halves
     for stage in log_half..log_size {
         stage_twiddle_idx = execute_dit_stage_f64(
+            simd,
             &mut reals[..size],
             &mut imags[..size],
             stage,
@@ -87,7 +86,8 @@
 }
 
 /// Recursive cache-blocked DIT FFT for f32 using post-order traversal.
-fn recursive_dit_fft_f32(
+fn recursive_dit_fft_f32<S: Simd>(
+    simd: S,
     reals: &mut [f32],
     imags: &mut [f32],
     size: usize,
@@ -100,6 +100,7 @@
     if size <= L1_BLOCK_SIZE {
         for stage in 0..log_size {
             stage_twiddle_idx = execute_dit_stage_f32(
+                simd,
                 &mut reals[..size],
                 &mut imags[..size],
                 stage,
@@ -117,8 +118,8 @@
     // Recursively process both halves
     run_maybe_in_parallel(
         size > opts.smallest_parallel_chunk_size,
-        || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0),
-        || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0),
+        || recursive_dit_fft_f32(simd, re_first_half, im_first_half, half, planner, opts, 0),
+        || recursive_dit_fft_f32(simd, re_second_half, im_second_half, half, planner, opts, 0),
     );
 
     // Both halves completed stages 0..log_half-1
@@ -128,6 +129,7 @@
     // Process remaining stages that span both halves
     for stage in log_half..log_size {
         stage_twiddle_idx = execute_dit_stage_f32(
+            simd,
             &mut reals[..size],
             &mut imags[..size],
             stage,
@@ -142,7 +144,8 @@
 
 /// Execute a single DIT stage, dispatching to appropriate kernel based on chunk size.
 /// Returns updated stage_twiddle_idx.
-fn execute_dit_stage_f64(
+fn execute_dit_stage_f64<S: Simd>(
+    simd: S,
     reals: &mut [f64],
     imags: &mut [f64],
     stage: usize,
@@ -153,34 +156,35 @@
     let chunk_size = dist << 1;
 
     if chunk_size == 2 {
-        fft_dit_chunk_2(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 4 {
-        fft_dit_chunk_4_simd_f64(reals, imags);
+        fft_dit_chunk_4_f64(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 8 {
-        fft_dit_chunk_8_simd_f64(reals, imags);
+        fft_dit_chunk_8_f64(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 16 {
-        fft_dit_chunk_16_simd_f64(reals, imags);
+        fft_dit_chunk_16_f64(simd, reals, imags);
        stage_twiddle_idx
    } else if chunk_size == 32 {
-        fft_dit_chunk_32_simd_f64(reals, imags);
+        fft_dit_chunk_32_f64(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 64 {
-        fft_dit_chunk_64_simd_f64(reals, imags);
+        fft_dit_chunk_64_f64(simd, reals, imags);
         stage_twiddle_idx
     } else {
         // For larger chunks, use general kernel with twiddles from planner
         let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx];
-        fft_dit_64_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
+        fft_dit_chunk_n_f64(simd, reals, imags, twiddles_re, twiddles_im, dist);
         stage_twiddle_idx + 1
     }
 }
 
 /// Execute a single DIT stage, dispatching to appropriate kernel based on chunk size.
 /// Returns updated stage_twiddle_idx.
-fn execute_dit_stage_f32(
+fn execute_dit_stage_f32<S: Simd>(
+    simd: S,
     reals: &mut [f32],
     imags: &mut [f32],
     stage: usize,
@@ -191,27 +195,27 @@
     let chunk_size = dist << 1;
 
     if chunk_size == 2 {
-        fft_dit_chunk_2(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 4 {
-        fft_dit_chunk_4_simd_f32(reals, imags);
+        fft_dit_chunk_4_f32(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 8 {
-        fft_dit_chunk_8_simd_f32(reals, imags);
+        fft_dit_chunk_8_f32(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 16 {
-        fft_dit_chunk_16_simd_f32(reals, imags);
+        fft_dit_chunk_16_f32(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 32 {
-        fft_dit_chunk_32_simd_f32(reals, imags);
+        fft_dit_chunk_32_f32(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 64 {
-        fft_dit_chunk_64_simd_f32(reals, imags);
+        fft_dit_chunk_64_f32(simd, reals, imags);
         stage_twiddle_idx
     } else {
         // For larger chunks, use general kernel with twiddles from planner
         let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx];
-        fft_dit_32_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
+        fft_dit_chunk_n_f32(simd, reals, imags, twiddles_re, twiddles_im, dist);
         stage_twiddle_idx + 1
     }
 }
@@ -261,7 +265,8 @@ pub fn fft_64_dit_with_planner_and_opts(
         }
     }
 
-    recursive_dit_fft_f64(reals, imags, n, planner, opts, 0);
+    let simd_level = Level::new();
+    dispatch!(simd_level, simd => recursive_dit_fft_f64(simd, reals, imags, n, planner, opts, 0));
 
     // Scaling for inverse transform
     if let Direction::Reverse = planner.direction {
@@ -304,7 +309,8 @@ pub fn fft_32_dit_with_planner_and_opts(
         }
     }
 
-    recursive_dit_fft_f32(reals, imags, n, planner, opts, 0);
+    let simd_level = Level::new();
+    dispatch!(simd_level, simd => recursive_dit_fft_f32(simd, reals, imags, n, planner, opts, 0));
 
     // Scaling for inverse transform
     if let Direction::Reverse = planner.direction {
diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs
index 1f4c4d7..5c1f50e 100644
--- a/src/kernels/dit.rs
+++ b/src/kernels/dit.rs
@@ -4,28 +4,41 @@
 //!
 use core::f32;
+use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8, Simd, SimdBase, SimdFloat, SimdFrom};
 use num_traits::Float;
-use wide::{f32x16, f32x4, f32x8, f64x4, f64x8};
-
-use crate::kernels::common::fft_chunk_2;
 
 /// DIT butterfly for chunk_size == 2
 /// Identical to DIF version (no twiddles at size 2)
-pub fn fft_dit_chunk_2<T: Float>(reals: &mut [T], imags: &mut [T]) {
-    fft_chunk_2(reals, imags);
+#[inline(always)] // required by fearless_simd
+pub fn fft_dit_chunk_2<S: Simd, T: Float>(_simd: S, reals: &mut [T], imags: &mut [T]) {
+    reals
+        .chunks_exact_mut(2)
+        .zip(imags.chunks_exact_mut(2))
+        .for_each(|(reals_chunk, imags_chunk)| {
+            let z0_re = reals_chunk[0];
+            let z0_im = imags_chunk[0];
+            let z1_re = reals_chunk[1];
+            let z1_im = imags_chunk[1];
+
+            reals_chunk[0] = z0_re + z1_re;
+            imags_chunk[0] = z0_im + z1_im;
+            reals_chunk[1] = z0_re - z1_re;
+            imags_chunk[1] = z0_im - z1_im;
+        });
+}
+
+/// DIT butterfly for chunk_size == 4 (f64)
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_4_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_4_simd_f64(simd, reals, imags),
+    )
 }
 
 /// DIT butterfly for chunk_size == 4 (f64)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_4_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_4_simd_f64<S: Simd>(_simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 2;
     const CHUNK_SIZE: usize = DIST << 1;
 
@@ -65,16 +78,17 @@
 }
 
 /// DIT butterfly for chunk_size == 4 (f32)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_4_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_4_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_4_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 4 (f32)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_4_simd_f32<S: Simd>(_simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 2;
     const CHUNK_SIZE: usize = DIST << 1;
 
@@ -114,32 +128,39 @@
 }
 
 /// DIT butterfly for chunk_size == 8 (f64) with SIMD
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_8_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_8_simd_f64(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 8 (f64) with SIMD
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_8_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 4;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f64x4::splat(2.0);
-    let sqrt2_2 = f64x4::new([
-        1.0,                               // W_8^0 real
-        std::f64::consts::FRAC_1_SQRT_2,   // W_8^1 real (sqrt(2)/2)
-        0.0,                               // W_8^2 real
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_8^3 real (-sqrt(2)/2)
-    ]);
-    let sqrt2_2_im = f64x4::new([
-        0.0,                               // W_8^0 imag
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_8^1 imag (-sqrt(2)/2)
-        -1.0,                              // W_8^2 imag
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_8^3 imag (-sqrt(2)/2)
-    ]);
+    let two = f64x4::splat(simd, 2.0);
+    let sqrt2_2 = f64x4::simd_from(
+        simd,
+        [
+            1.0,                               // W_8^0 real
+            std::f64::consts::FRAC_1_SQRT_2,   // W_8^1 real (sqrt(2)/2)
+            0.0,                               // W_8^2 real
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_8^3 real (-sqrt(2)/2)
+        ],
+    );
+    let sqrt2_2_im = f64x4::simd_from(
+        simd,
+        [
+            0.0,                               // W_8^0 imag
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_8^1 imag (-sqrt(2)/2)
+            -1.0,                              // W_8^2 imag
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_8^3 imag (-sqrt(2)/2)
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -147,10 +168,10 @@ pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
-            let in0_re = f64x4::new(reals_s0[0..4].try_into().unwrap());
-            let in1_re = f64x4::new(reals_s1[0..4].try_into().unwrap());
-            let in0_im = f64x4::new(imags_s0[0..4].try_into().unwrap());
-            let in1_im = f64x4::new(imags_s1[0..4].try_into().unwrap());
+            let in0_re = f64x4::from_slice(simd, &reals_s0[0..4]);
+            let in1_re = f64x4::from_slice(simd, &reals_s1[0..4]);
+            let in0_im = f64x4::from_slice(simd, &imags_s0[0..4]);
+            let in1_im = f64x4::from_slice(simd, &imags_s1[0..4]);
 
             // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im
             let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re));
@@ -161,40 +182,47 @@ pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
 /// DIT butterfly for chunk_size == 8 (f32) with SIMD
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_8_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_8_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 8 (f32) with SIMD
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_8_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 4;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f32x4::splat(2.0);
-    let sqrt2_2 = f32x4::new([
-        1.0_f32,                           // W_8^0 real
-        std::f32::consts::FRAC_1_SQRT_2,   // W_8^1 real (sqrt(2)/2)
-        0.0_f32,                           // W_8^2 real
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_8^3 real (-sqrt(2)/2)
-    ]);
-    let sqrt2_2_im = f32x4::new([
-        0.0_f32,                           // W_8^0 imag
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_8^1 imag (-sqrt(2)/2)
-        -1.0_f32,                          // W_8^2 imag
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_8^3 imag (-sqrt(2)/2)
-    ]);
+    let two = f32x4::splat(simd, 2.0);
+    let sqrt2_2 = f32x4::simd_from(
+        simd,
+        [
+            1.0_f32,                           // W_8^0 real
+            std::f32::consts::FRAC_1_SQRT_2,   // W_8^1 real (sqrt(2)/2)
+            0.0_f32,                           // W_8^2 real
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_8^3 real (-sqrt(2)/2)
+        ],
+    );
+    let sqrt2_2_im = f32x4::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_8^0 imag
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_8^1 imag (-sqrt(2)/2)
+            -1.0_f32,                          // W_8^2 imag
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_8^3 imag (-sqrt(2)/2)
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -202,10 +230,10 @@ pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
-            let in0_re = f32x4::new(reals_s0[0..4].try_into().unwrap());
-            let in1_re = f32x4::new(reals_s1[0..4].try_into().unwrap());
-            let in0_im = f32x4::new(imags_s0[0..4].try_into().unwrap());
-            let in1_im = f32x4::new(imags_s1[0..4].try_into().unwrap());
+            let in0_re = f32x4::from_slice(simd, &reals_s0[0..4]);
+            let in1_re = f32x4::from_slice(simd, &reals_s1[0..4]);
+            let in0_im = f32x4::from_slice(simd, &imags_s0[0..4]);
+            let in1_im = f32x4::from_slice(simd, &imags_s1[0..4]);
 
             // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im
             let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re));
@@ -216,51 +244,58 @@ pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
-/// DIT butterfly for chunk_size == 16 (f64)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+/// DIT butterfly for chunk_size == 16 (f64)
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_16_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_16_simd_f64(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 16 (f64)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_16_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 8;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f64x8::splat(2.0);
+    let two = f64x8::splat(simd, 2.0);
 
     // Twiddle factors for W_16^k where k = 0..7
-    let twiddle_re = f64x8::new([
-        1.0,                               // W_16^0
-        0.9238795325112867,                // W_16^1 = cos(pi/8)
-        std::f64::consts::FRAC_1_SQRT_2,   // W_16^2 = sqrt(2)/2
-        0.38268343236508984,               // W_16^3 = cos(3*pi/8)
-        0.0,                               // W_16^4
-        -0.38268343236508984,              // W_16^5 = -cos(3*pi/8)
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
-        -0.9238795325112867,               // W_16^7 = -cos(pi/8)
-    ]);
-
-    let twiddle_im = f64x8::new([
-        0.0,                               // W_16^0
-        -0.38268343236508984,              // W_16^1 = -sin(pi/8)
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_16^2 = -sqrt(2)/2
-        -0.9238795325112867,               // W_16^3 = -sin(3*pi/8)
-        -1.0,                              // W_16^4
-        -0.9238795325112867,               // W_16^5 = -sin(3*pi/8)
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
-        -0.38268343236508984,              // W_16^7 = -sin(pi/8)
-    ]);
+    let twiddle_re = f64x8::simd_from(
+        simd,
+        [
+            1.0,                               // W_16^0
+            0.9238795325112867,                // W_16^1 = cos(pi/8)
+            std::f64::consts::FRAC_1_SQRT_2,   // W_16^2 = sqrt(2)/2
+            0.38268343236508984,               // W_16^3 = cos(3*pi/8)
+            0.0,                               // W_16^4
+            -0.38268343236508984,              // W_16^5 = -cos(3*pi/8)
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
+            -0.9238795325112867,               // W_16^7 = -cos(pi/8)
+        ],
+    );
+
+    let twiddle_im = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_16^0
+            -0.38268343236508984,              // W_16^1 = -sin(pi/8)
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_16^2 = -sqrt(2)/2
+            -0.9238795325112867,               // W_16^3 = -sin(3*pi/8)
+            -1.0,                              // W_16^4
+            -0.9238795325112867,               // W_16^5 = -sin(3*pi/8)
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
+            -0.38268343236508984,              // W_16^7 = -sin(pi/8)
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -269,10 +304,10 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Load all 8 elements at once
-            let in0_re = f64x8::new(reals_s0[0..8].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[0..8].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[0..8].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[0..8].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[0..8]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[0..8]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[0..8]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[0..8]);
 
             let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im));
@@ -281,51 +316,58 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
 /// DIT butterfly for chunk_size == 16 (f32)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_16_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_16_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 16 (f32)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_16_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 8;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f32x8::splat(2.0);
+    let two = f32x8::splat(simd, 2.0);
 
     // Twiddle factors for W_16^k where k = 0..7
-    let twiddle_re = f32x8::new([
-        1.0_f32,                           // W_16^0
-        0.923_879_5_f32,                   // W_16^1 = cos(pi/8)
-        std::f32::consts::FRAC_1_SQRT_2,   // W_16^2 = sqrt(2)/2
-        0.382_683_43_f32,                  // W_16^3 = cos(3*pi/8)
-        0.0_f32,                           // W_16^4
-        -0.382_683_43_f32,                 // W_16^5 = -cos(3*pi/8)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
-        -0.923_879_5_f32,                  // W_16^7 = -cos(pi/8)
-    ]);
-
-    let twiddle_im = f32x8::new([
-        0.0_f32,                           // W_16^0
-        -0.382_683_43_f32,                 // W_16^1 = -sin(pi/8)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_16^2 = -sqrt(2)/2
-        -0.923_879_5_f32,                  // W_16^3 = -sin(3*pi/8)
-        -1.0_f32,                          // W_16^4
-        -0.923_879_5_f32,                  // W_16^5 = -sin(3*pi/8)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
-        -0.382_683_43_f32,                 // W_16^7 = -sin(pi/8)
-    ]);
+    let twiddle_re = f32x8::simd_from(
+        simd,
+        [
+            1.0_f32,                           // W_16^0
+            0.923_879_5_f32,                   // W_16^1 = cos(pi/8)
+            std::f32::consts::FRAC_1_SQRT_2,   // W_16^2 = sqrt(2)/2
+            0.382_683_43_f32,                  // W_16^3 = cos(3*pi/8)
+            0.0_f32,                           // W_16^4
+            -0.382_683_43_f32,                 // W_16^5 = -cos(3*pi/8)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
+            -0.923_879_5_f32,                  // W_16^7 = -cos(pi/8)
+        ],
+    );
+
+    let twiddle_im = f32x8::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_16^0
+            -0.382_683_43_f32,                 // W_16^1 = -sin(pi/8)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_16^2 = -sqrt(2)/2
+            -0.923_879_5_f32,                  // W_16^3 = -sin(3*pi/8)
+            -1.0_f32,                          // W_16^4
+            -0.923_879_5_f32,                  // W_16^5 = -sin(3*pi/8)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
+            -0.382_683_43_f32,                 // W_16^7 = -sin(pi/8)
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -334,10 +376,10 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Load all 8 elements at once
-            let in0_re = f32x8::new(reals_s0[0..8].try_into().unwrap());
-            let in1_re = f32x8::new(reals_s1[0..8].try_into().unwrap());
-            let in0_im = f32x8::new(imags_s0[0..8].try_into().unwrap());
-            let in1_im = f32x8::new(imags_s1[0..8].try_into().unwrap());
+            let in0_re = f32x8::from_slice(simd, &reals_s0[0..8]);
+            let in1_re = f32x8::from_slice(simd, &reals_s1[0..8]);
+            let in0_im = f32x8::from_slice(simd, &imags_s0[0..8]);
+            let in1_im = f32x8::from_slice(simd, &imags_s1[0..8]);
 
             let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im));
@@ -346,73 +388,86 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
 /// DIT butterfly for chunk_size == 32 (f64)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_32_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_32_simd_f64(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 32 (f64)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_32_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 16;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f64x8::splat(2.0);
+    let two = f64x8::splat(simd, 2.0);
 
     // First 8 twiddle factors for W_32^k where k = 0..7
-    let twiddle_re_0_7 = f64x8::new([
-        1.0,                               // W_32^0 = 1
-        0.9807852804032304,                // W_32^1 = cos(π/16)
-        0.9238795325112867,                // W_32^2 = cos(π/8)
-        0.8314696123025452,                // W_32^3 = cos(3π/16)
-        std::f64::consts::FRAC_1_SQRT_2,   // W_32^4 = sqrt(2)/2
-        0.5555702330196022,                // W_32^5 = cos(5π/16)
-        0.3826834323650898,                // W_32^6 = cos(3π/8)
-        0.19509032201612825,               // W_32^7 = cos(7π/16)
-    ]);
-
-    let twiddle_im_0_7 = f64x8::new([
-        0.0,                               // W_32^0
-        -0.19509032201612825,              // W_32^1 = -sin(π/16)
-        -0.3826834323650898,               // W_32^2 = -sin(π/8)
-        -0.5555702330196022,               // W_32^3 = -sin(3π/16)
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_32^4 = -sqrt(2)/2
-        -0.8314696123025452,               // W_32^5 = -sin(5π/16)
-        -0.9238795325112867,               // W_32^6 = -sin(3π/8)
-        -0.9807852804032304,               // W_32^7 = -sin(7π/16)
-    ]);
+    let twiddle_re_0_7 = f64x8::simd_from(
+        simd,
+        [
+            1.0,                               // W_32^0 = 1
+            0.9807852804032304,                // W_32^1 = cos(π/16)
+            0.9238795325112867,                // W_32^2 = cos(π/8)
+            0.8314696123025452,                // W_32^3 = cos(3π/16)
+            std::f64::consts::FRAC_1_SQRT_2,   // W_32^4 = sqrt(2)/2
+            0.5555702330196022,                // W_32^5 = cos(5π/16)
+            0.3826834323650898,                // W_32^6 = cos(3π/8)
+            0.19509032201612825,               // W_32^7 = cos(7π/16)
+        ],
+    );
+
+    let twiddle_im_0_7 = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_32^0
+            -0.19509032201612825,              // W_32^1 = -sin(π/16)
+            -0.3826834323650898,               // W_32^2 = -sin(π/8)
+            -0.5555702330196022,               // W_32^3 = -sin(3π/16)
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_32^4 = -sqrt(2)/2
+            -0.8314696123025452,               // W_32^5 = -sin(5π/16)
+            -0.9238795325112867,               // W_32^6 = -sin(3π/8)
+            -0.9807852804032304,               // W_32^7 = -sin(7π/16)
+        ],
+    );
 
     // Second 8 twiddle factors for W_32^k where k = 8..15
-    let twiddle_re_8_15 = f64x8::new([
-        0.0,                               // W_32^8 = 0 - i
-        -0.19509032201612825,              // W_32^9
-        -0.3826834323650898,               // W_32^10
-        -0.5555702330196022,               // W_32^11
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_32^12
-        -0.8314696123025452,               // W_32^13
-        -0.9238795325112867,               // W_32^14
-        -0.9807852804032304,               // W_32^15
-    ]);
-
-    let twiddle_im_8_15 = f64x8::new([
-        -1.0,                              // W_32^8
-        -0.9807852804032304,               // W_32^9
-        -0.9238795325112867,               // W_32^10
-        -0.8314696123025452,               // W_32^11
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_32^12
-        -0.5555702330196022,               // W_32^13
-        -0.3826834323650898,               // W_32^14
-        -0.19509032201612825,              // W_32^15
-    ]);
+    let twiddle_re_8_15 = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_32^8 = 0 - i
+            -0.19509032201612825,              // W_32^9
+            -0.3826834323650898,               // W_32^10
+            -0.5555702330196022,               // W_32^11
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_32^12
+            -0.8314696123025452,               // W_32^13
+            -0.9238795325112867,               // W_32^14
+            -0.9807852804032304,               // W_32^15
+        ],
+    );
+
+    let twiddle_im_8_15 = f64x8::simd_from(
+        simd,
+        [
+            -1.0,                              // W_32^8
+            -0.9807852804032304,               // W_32^9
+            -0.9238795325112867,               // W_32^10
+            -0.8314696123025452,               // W_32^11
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_32^12
+            -0.5555702330196022,               // W_32^13
+            -0.3826834323650898,               // W_32^14
+            -0.19509032201612825,              // W_32^15
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -421,10 +476,10 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Process first 8 butterflies
-            let in0_re_0_7 = f64x8::new(reals_s0[0..8].try_into().unwrap());
-            let in1_re_0_7 = f64x8::new(reals_s1[0..8].try_into().unwrap());
-            let in0_im_0_7 = f64x8::new(imags_s0[0..8].try_into().unwrap());
-            let in1_im_0_7 = f64x8::new(imags_s1[0..8].try_into().unwrap());
+            let in0_re_0_7 = f64x8::from_slice(simd, &reals_s0[0..8]);
+            let in1_re_0_7 = f64x8::from_slice(simd, &reals_s1[0..8]);
+            let in0_im_0_7 = f64x8::from_slice(simd, &imags_s0[0..8]);
+            let in1_im_0_7 = f64x8::from_slice(simd, &imags_s1[0..8]);
 
             let out0_re_0_7 =
                 twiddle_im_0_7.mul_add(-in1_im_0_7, twiddle_re_0_7.mul_add(in1_re_0_7, in0_re_0_7));
@@ -434,16 +489,16 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re_0_7 = two.mul_sub(in0_re_0_7, out0_re_0_7);
             let out1_im_0_7 = two.mul_sub(in0_im_0_7, out0_im_0_7);
 
-            reals_s0[0..8].copy_from_slice(out0_re_0_7.as_array());
-            imags_s0[0..8].copy_from_slice(out0_im_0_7.as_array());
-            reals_s1[0..8].copy_from_slice(out1_re_0_7.as_array());
-            imags_s1[0..8].copy_from_slice(out1_im_0_7.as_array());
+            out0_re_0_7.store_slice(&mut reals_s0[0..8]);
+            out0_im_0_7.store_slice(&mut imags_s0[0..8]);
+            out1_re_0_7.store_slice(&mut reals_s1[0..8]);
+            out1_im_0_7.store_slice(&mut imags_s1[0..8]);
 
             // Process second 8 butterflies
-            let in0_re_8_15 = f64x8::new(reals_s0[8..16].try_into().unwrap());
-            let in1_re_8_15 = f64x8::new(reals_s1[8..16].try_into().unwrap());
-            let in0_im_8_15 = f64x8::new(imags_s0[8..16].try_into().unwrap());
-            let in1_im_8_15 = f64x8::new(imags_s1[8..16].try_into().unwrap());
+            let in0_re_8_15 = f64x8::from_slice(simd, &reals_s0[8..16]);
+            let in1_re_8_15 = f64x8::from_slice(simd, &reals_s1[8..16]);
+            let in0_im_8_15 = f64x8::from_slice(simd, &imags_s0[8..16]);
+            let in1_im_8_15 = f64x8::from_slice(simd, &imags_s1[8..16]);
 
             let out0_re_8_15 = twiddle_im_8_15.mul_add(
                 -in1_im_8_15,
@@ -457,67 +512,74 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re_8_15 = two.mul_sub(in0_re_8_15, out0_re_8_15);
             let out1_im_8_15 = two.mul_sub(in0_im_8_15, out0_im_8_15);
 
-            reals_s0[8..16].copy_from_slice(out0_re_8_15.as_array());
-            imags_s0[8..16].copy_from_slice(out0_im_8_15.as_array());
-            reals_s1[8..16].copy_from_slice(out1_re_8_15.as_array());
-            imags_s1[8..16].copy_from_slice(out1_im_8_15.as_array());
+            out0_re_8_15.store_slice(&mut reals_s0[8..16]);
+            out0_im_8_15.store_slice(&mut imags_s0[8..16]);
+            out1_re_8_15.store_slice(&mut reals_s1[8..16]);
+            out1_im_8_15.store_slice(&mut imags_s1[8..16]);
         });
 }
 
 /// DIT butterfly for chunk_size == 32 (f32)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_32_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_32_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 32 (f32)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_32_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 16;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f32x16::splat(2.0);
+    let two = f32x16::splat(simd, 2.0);
 
     // All 16 twiddle factors for W_32^k where k = 0..15
-    let twiddle_re = f32x16::new([
-        1.0_f32,                           // W_32^0 = 1
-        0.980_785_25_f32,                  // W_32^1 = cos(π/16)
-        0.923_879_5_f32,                   // W_32^2 = cos(π/8)
-        0.831_469_6_f32,                   // W_32^3 = cos(3π/16)
-        std::f32::consts::FRAC_1_SQRT_2,   // W_32^4 = sqrt(2)/2
-        0.555_570_24_f32,                  // W_32^5 = cos(5π/16)
-        0.382_683_43_f32,                  // W_32^6 = cos(3π/8)
-        0.195_090_32_f32,                  // W_32^7 = cos(7π/16)
-        0.0_f32,                           // W_32^8 = 0 - i
-        -0.195_090_32_f32,                 // W_32^9
-        -0.382_683_43_f32,                 // W_32^10
-        -0.555_570_24_f32,                 // W_32^11
-        -f32::consts::FRAC_1_SQRT_2,       // W_32^12
-        -0.831_469_6_f32,                  // W_32^13
-        -0.923_879_5_f32,                  // W_32^14
-        -0.980_785_25_f32,                 // W_32^15
-    ]);
-
-    let twiddle_im = f32x16::new([
-        0.0_f32,                           // W_32^0
-        -0.195_090_32_f32,                 // W_32^1 = -sin(π/16)
-        -0.382_683_43_f32,                 // W_32^2 = -sin(π/8)
-        -0.555_570_24_f32,                 // W_32^3 = -sin(3π/16)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_32^4 = -sqrt(2)/2
-        -0.831_469_6_f32,                  // W_32^5 = -sin(5π/16)
-        -0.923_879_5_f32,                  // W_32^6 = -sin(3π/8)
-        -0.980_785_25_f32,                 // W_32^7 = -sin(7π/16)
-        -1.0_f32,                          // W_32^8
-        -0.980_785_25_f32,                 // W_32^9
-        -0.923_879_5_f32,                  // W_32^10
-        -0.831_469_6_f32,                  // W_32^11
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_32^12
-        -0.555_570_24_f32,                 // W_32^13
-        -0.382_683_43_f32,                 // W_32^14
-        -0.195_090_32_f32,                 // W_32^15
-    ]);
+    let twiddle_re = f32x16::simd_from(
+        simd,
+        [
+            1.0_f32,                           // W_32^0 = 1
+            0.980_785_25_f32,                  // W_32^1 = cos(π/16)
+            0.923_879_5_f32,                   // W_32^2 = cos(π/8)
+            0.831_469_6_f32,                   // W_32^3 = cos(3π/16)
+            std::f32::consts::FRAC_1_SQRT_2,   // W_32^4 = sqrt(2)/2
+            0.555_570_24_f32,                  // W_32^5 = cos(5π/16)
+            0.382_683_43_f32,                  // W_32^6 = cos(3π/8)
+            0.195_090_32_f32,                  // W_32^7 = cos(7π/16)
+            0.0_f32,                           // W_32^8 = 0 - i
+            -0.195_090_32_f32,                 // W_32^9
+            -0.382_683_43_f32,                 // W_32^10
+            -0.555_570_24_f32,                 // W_32^11
+            -f32::consts::FRAC_1_SQRT_2,       // W_32^12
+            -0.831_469_6_f32,                  // W_32^13
+            -0.923_879_5_f32,                  // W_32^14
+            -0.980_785_25_f32,                 // W_32^15
+        ],
+    );
+
+    let twiddle_im = f32x16::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_32^0
+            -0.195_090_32_f32,                 // W_32^1 = -sin(π/16)
+            -0.382_683_43_f32,                 // W_32^2 = -sin(π/8)
+            -0.555_570_24_f32,                 // W_32^3 = -sin(3π/16)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_32^4 = -sqrt(2)/2
+            -0.831_469_6_f32,                  // W_32^5 = -sin(5π/16)
+            -0.923_879_5_f32,                  // W_32^6 = -sin(3π/8)
+            -0.980_785_25_f32,                 // W_32^7 = -sin(7π/16)
+            -1.0_f32,                          // W_32^8
+            -0.980_785_25_f32,                 // W_32^9
+            -0.923_879_5_f32,                  // W_32^10
+            -0.831_469_6_f32,                  // W_32^11
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_32^12
+            -0.555_570_24_f32,                 // W_32^13
+            -0.382_683_43_f32,                 // W_32^14
+            -0.195_090_32_f32,                 // W_32^15
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -526,10 +588,10 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Process all 16 butterflies at once with f32x16
-            let in0_re = f32x16::new(reals_s0[0..16].try_into().unwrap());
-            let in1_re = f32x16::new(reals_s1[0..16].try_into().unwrap());
-            let in0_im = f32x16::new(imags_s0[0..16].try_into().unwrap());
-            let in1_im = f32x16::new(imags_s1[0..16].try_into().unwrap());
+            let in0_re = f32x16::from_slice(simd, &reals_s0[0..16]);
+            let in1_re = f32x16::from_slice(simd, &reals_s1[0..16]);
+            let in0_im = f32x16::from_slice(simd, &imags_s0[0..16]);
+            let in1_im = f32x16::from_slice(simd, &imags_s1[0..16]);
 
             let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im));
@@ -537,121 +599,146 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
 /// DIT butterfly for chunk_size == 64 (f64)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_64_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_64_simd_f64(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 64 (f64)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_64_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 32;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f64x8::splat(2.0);
+    let two = f64x8::splat(simd, 2.0);
 
     // Process in 4 iterations of 8 butterflies each
     // Twiddles for W_64^k where k = 0..7
-    let twiddle_re_0_7 = f64x8::new([
-        1.0,                               // W_64^0 = 1
-        0.9951847266721969,                // W_64^1 = cos(π/32)
-        0.9807852804032304,                // W_64^2 = cos(π/16)
-        0.9569403357322089,                // W_64^3 = cos(3π/32)
-        0.9238795325112867,                // W_64^4 = cos(π/8)
-        0.8819212643483549,                // W_64^5 = cos(5π/32)
-        0.8314696123025452,                // W_64^6 = cos(3π/16)
-        0.773010453362737,                 // W_64^7 = cos(7π/32)
-    ]);
-
-    let twiddle_im_0_7 = f64x8::new([
-        0.0,                               // W_64^0
-        -0.0980171403295606,               // W_64^1 = -sin(π/32)
-        -0.19509032201612825,              // W_64^2 = -sin(π/16)
-        -0.29028467725446233,              // W_64^3 = -sin(3π/32)
-        -0.3826834323650898,               // W_64^4 = -sin(π/8)
-        -0.47139673682599764,              // W_64^5 = -sin(5π/32)
-        -0.5555702330196022,               // W_64^6 = -sin(3π/16)
-        -0.6343932841636455,               // W_64^7 = -sin(7π/32)
-    ]);
+    let twiddle_re_0_7 = f64x8::simd_from(
+        simd,
+        [
+            1.0,                               // W_64^0 = 1
+            0.9951847266721969,                // W_64^1 = cos(π/32)
+            0.9807852804032304,                // W_64^2 = cos(π/16)
+            0.9569403357322089,                // W_64^3 = cos(3π/32)
+            0.9238795325112867,                // W_64^4 = cos(π/8)
+            0.8819212643483549,                // W_64^5 = cos(5π/32)
+            0.8314696123025452,                // W_64^6 = cos(3π/16)
+            0.773010453362737,                 // W_64^7 = cos(7π/32)
+        ],
+    );
+
+    let twiddle_im_0_7 = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_64^0
+            -0.0980171403295606,               // W_64^1 = -sin(π/32)
+            -0.19509032201612825,              // W_64^2 = -sin(π/16)
+            -0.29028467725446233,              // W_64^3 = -sin(3π/32)
+            -0.3826834323650898,               // W_64^4 = -sin(π/8)
+            -0.47139673682599764,              // W_64^5 = -sin(5π/32)
+            -0.5555702330196022,               // W_64^6 = -sin(3π/16)
+            -0.6343932841636455,               // W_64^7 = -sin(7π/32)
+        ],
+    );
 
     // Twiddles for k = 8..15
-    let twiddle_re_8_15 = f64x8::new([
-        std::f64::consts::FRAC_1_SQRT_2,   // W_64^8 = sqrt(2)/2
-        0.6343932841636455,                // W_64^9
-        0.5555702330196022,                // W_64^10
-        0.47139673682599764,               // W_64^11
-        0.3826834323650898,                // W_64^12
-        0.29028467725446233,               // W_64^13
-        0.19509032201612825,               // W_64^14
-        0.0980171403295606,                // W_64^15
-    ]);
-
-    let twiddle_im_8_15 = f64x8::new([
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_64^8
-        -0.773010453362737,                // W_64^9
-        -0.8314696123025452,               // W_64^10
-        -0.8819212643483549,               // W_64^11
-        -0.9238795325112867,               // W_64^12
-        -0.9569403357322089,               // W_64^13
-        -0.9807852804032304,               // W_64^14
-        -0.9951847266721969,               // W_64^15
-    ]);
+    let twiddle_re_8_15 = f64x8::simd_from(
+        simd,
+        [
+            std::f64::consts::FRAC_1_SQRT_2,   // W_64^8 = sqrt(2)/2
+            0.6343932841636455,                // W_64^9
+            0.5555702330196022,                // W_64^10
+            0.47139673682599764,               // W_64^11
+            0.3826834323650898,                // W_64^12
+            0.29028467725446233,               // W_64^13
+            0.19509032201612825,               // W_64^14
+            0.0980171403295606,                // W_64^15
+        ],
+    );
+
+    let twiddle_im_8_15 = f64x8::simd_from(
+        simd,
+        [
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_64^8
+            -0.773010453362737,                // W_64^9
+            -0.8314696123025452,               // W_64^10
+            -0.8819212643483549,               // W_64^11
+            -0.9238795325112867,               // W_64^12
+            -0.9569403357322089,               // W_64^13
+            -0.9807852804032304,               // W_64^14
+            -0.9951847266721969,               // W_64^15
+        ],
+    );
 
     // Twiddles for k = 16..23
-    let twiddle_re_16_23 = f64x8::new([
-        0.0,                               // W_64^16 = -i
-        -0.0980171403295606,               // W_64^17
-        -0.19509032201612825,              // W_64^18
-        -0.29028467725446233,              // W_64^19
-        -0.3826834323650898,               // W_64^20
-        -0.47139673682599764,              // W_64^21
-        -0.5555702330196022,               // W_64^22
-        -0.6343932841636455,               // W_64^23
-    ]);
-
-    let twiddle_im_16_23 = f64x8::new([
-        -1.0,                              // W_64^16
-        -0.9951847266721969,               // W_64^17
-        -0.9807852804032304,               // W_64^18
-        -0.9569403357322089,               // W_64^19
-        -0.9238795325112867,               // W_64^20
-        -0.8819212643483549,               // W_64^21
-        -0.8314696123025452,               // W_64^22
-        -0.773010453362737,                // W_64^23
-    ]);
+    let twiddle_re_16_23 = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_64^16 = -i
+            -0.0980171403295606,               // W_64^17
+            -0.19509032201612825,              // W_64^18
+            -0.29028467725446233,              // W_64^19
+            -0.3826834323650898,               // W_64^20
+            -0.47139673682599764,              // W_64^21
+            -0.5555702330196022,               // W_64^22
+            -0.6343932841636455,               // W_64^23
+        ],
+    );
+
+    let twiddle_im_16_23 = f64x8::simd_from(
+        simd,
+        [
+            -1.0,                              // W_64^16
+            -0.9951847266721969,               // W_64^17
+            -0.9807852804032304,               // W_64^18
+            -0.9569403357322089,               // W_64^19
+            -0.9238795325112867,               // W_64^20
+            -0.8819212643483549,               // W_64^21
+            -0.8314696123025452,               // W_64^22
+            -0.773010453362737,                // W_64^23
+        ],
+    );
 
     // Twiddles for k = 24..31
-    let twiddle_re_24_31 = f64x8::new([
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_64^24
-        -0.773010453362737,                // W_64^25
-        -0.8314696123025452,               // W_64^26
-        -0.8819212643483549,               // W_64^27
-        -0.9238795325112867,               // W_64^28
-        -0.9569403357322089,               // W_64^29
-        -0.9807852804032304,               // W_64^30
-        -0.9951847266721969,               // W_64^31
-    ]);
-
-    let twiddle_im_24_31 = f64x8::new([
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_64^24
-        -0.6343932841636455,               // W_64^25
-        -0.5555702330196022,               // W_64^26
-        -0.47139673682599764,              // W_64^27
-        -0.3826834323650898,               // W_64^28
-        -0.29028467725446233,              // W_64^29
-        -0.19509032201612825,              // W_64^30
-        -0.0980171403295606,               // W_64^31
-    ]);
+    let twiddle_re_24_31 = f64x8::simd_from(
+        simd,
+        [
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_64^24
+            -0.773010453362737,                // W_64^25
+            -0.8314696123025452,               // W_64^26
+            -0.8819212643483549,               // W_64^27
+            -0.9238795325112867,               // W_64^28
+            -0.9569403357322089,               // W_64^29
+            -0.9807852804032304,               // W_64^30
+            -0.9951847266721969,               // W_64^31
+        ],
+    );
+
+    let twiddle_im_24_31 = f64x8::simd_from(
+        simd,
+        [
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_64^24
+            -0.6343932841636455,               // W_64^25
+            -0.5555702330196022,               // W_64^26
+            -0.47139673682599764,              // W_64^27
+            -0.3826834323650898,               // W_64^28
+            -0.29028467725446233,              // W_64^29
+            -0.19509032201612825,              // W_64^30
+            -0.0980171403295606,               // W_64^31
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -660,42 +747,42 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Process butterflies 0..7
-            let in0_re = f64x8::new(reals_s0[0..8].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[0..8].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[0..8].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[0..8].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[0..8]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[0..8]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[0..8]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[0..8]);
 
             let out0_re = twiddle_im_0_7.mul_add(-in1_im, twiddle_re_0_7.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im_0_7.mul_add(in1_re, twiddle_re_0_7.mul_add(in1_im, in0_im));
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[0..8].copy_from_slice(out0_re.as_array());
-            imags_s0[0..8].copy_from_slice(out0_im.as_array());
-            reals_s1[0..8].copy_from_slice(out1_re.as_array());
-            imags_s1[0..8].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[0..8]);
+            out0_im.store_slice(&mut imags_s0[0..8]);
+            out1_re.store_slice(&mut reals_s1[0..8]);
+            out1_im.store_slice(&mut imags_s1[0..8]);
 
             // Process butterflies 8..15
-            let in0_re = f64x8::new(reals_s0[8..16].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[8..16].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[8..16].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[8..16].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[8..16]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[8..16]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[8..16]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[8..16]);
 
             let out0_re = twiddle_im_8_15.mul_add(-in1_im, twiddle_re_8_15.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im_8_15.mul_add(in1_re, twiddle_re_8_15.mul_add(in1_im, in0_im));
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[8..16].copy_from_slice(out0_re.as_array());
-            imags_s0[8..16].copy_from_slice(out0_im.as_array());
-            reals_s1[8..16].copy_from_slice(out1_re.as_array());
-            imags_s1[8..16].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[8..16]);
+            out0_im.store_slice(&mut imags_s0[8..16]);
+            out1_re.store_slice(&mut reals_s1[8..16]);
+            out1_im.store_slice(&mut imags_s1[8..16]);
 
             // Process butterflies 16..23
-            let in0_re = f64x8::new(reals_s0[16..24].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[16..24].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[16..24].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[16..24].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[16..24]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[16..24]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[16..24]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[16..24]);
 
             let out0_re =
                 twiddle_im_16_23.mul_add(-in1_im, twiddle_re_16_23.mul_add(in1_re, in0_re));
@@ -704,16 +791,16 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[16..24].copy_from_slice(out0_re.as_array());
-            imags_s0[16..24].copy_from_slice(out0_im.as_array());
-            reals_s1[16..24].copy_from_slice(out1_re.as_array());
-            imags_s1[16..24].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[16..24]);
+            out0_im.store_slice(&mut imags_s0[16..24]);
+            out1_re.store_slice(&mut reals_s1[16..24]);
+            out1_im.store_slice(&mut imags_s1[16..24]);
 
             // Process butterflies 24..31
-            let in0_re = f64x8::new(reals_s0[24..32].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[24..32].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[24..32].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[24..32].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[24..32]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[24..32]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[24..32]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[24..32]);
 
             let out0_re =
                 twiddle_im_24_31.mul_add(-in1_im, twiddle_re_24_31.mul_add(in1_re, in0_re));
@@ -722,107 +809,120 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[24..32].copy_from_slice(out0_re.as_array());
-            imags_s0[24..32].copy_from_slice(out0_im.as_array());
-            reals_s1[24..32].copy_from_slice(out1_re.as_array());
-            imags_s1[24..32].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[24..32]);
+            out0_im.store_slice(&mut imags_s0[24..32]);
+            out1_re.store_slice(&mut reals_s1[24..32]);
+            out1_im.store_slice(&mut imags_s1[24..32]);
         });
 }
 
 /// DIT butterfly for chunk_size == 64 (f32)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_64_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_64_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 64 (f32)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_64_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 32;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f32x16::splat(2.0);
+    let two = f32x16::splat(simd, 2.0);
 
     // Process in 2 iterations of 16 butterflies each
     // Twiddles for W_64^k where k = 0..15
-    let twiddle_re_0_15 = f32x16::new([
-        1.0_f32,                           // W_64^0 = 1
-        0.995_184_7_f32,                   // W_64^1 = cos(π/32)
-        0.980_785_25_f32,                  // W_64^2 = cos(π/16)
-        0.956_940_35_f32,                  // W_64^3 = cos(3π/32)
-        0.923_879_5_f32,                   // W_64^4 = cos(π/8)
-        0.881_921_3_f32,                   // W_64^5 = cos(5π/32)
-        0.831_469_6_f32,                   // W_64^6 = cos(3π/16)
-        0.773_010_43_f32,                  // W_64^7 = cos(7π/32)
-        std::f32::consts::FRAC_1_SQRT_2,   // W_64^8 = sqrt(2)/2
-        0.634_393_3_f32,                   // W_64^9
-        0.555_570_24_f32,                  // W_64^10
-        0.471_396_74_f32,                  // W_64^11
-        0.382_683_43_f32,                  // W_64^12
-        0.290_284_66_f32,                  // W_64^13
-        0.195_090_32_f32,                  // W_64^14
-        0.098_017_14_f32,                  // W_64^15
-    ]);
-
-    let twiddle_im_0_15 = f32x16::new([
-        0.0_f32,                           // W_64^0
-        -0.098_017_14_f32,                 // W_64^1 = -sin(π/32)
-        -0.195_090_32_f32,                 // W_64^2 = -sin(π/16)
-        -0.290_284_66_f32,                 // W_64^3 = -sin(3π/32)
-        -0.382_683_43_f32,                 // W_64^4 = -sin(π/8)
-        -0.471_396_74_f32,                 // W_64^5 = -sin(5π/32)
-        -0.555_570_24_f32,                 // W_64^6 = -sin(3π/16)
-        -0.634_393_3_f32,                  // W_64^7 = -sin(7π/32)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_64^8
-        -0.773_010_43_f32,                 // W_64^9
-        -0.831_469_6_f32,                  // W_64^10
-        -0.881_921_3_f32,                  // W_64^11
-        -0.923_879_5_f32,                  // W_64^12
-        -0.956_940_35_f32,                 // W_64^13
-        -0.980_785_25_f32,                 // W_64^14
-        -0.995_184_7_f32,                  // W_64^15
-    ]);
+    let twiddle_re_0_15 = f32x16::simd_from(
+        simd,
+        [
+            1.0_f32,                           // W_64^0 = 1
+            0.995_184_7_f32,                   // W_64^1 = cos(π/32)
+            0.980_785_25_f32,                  // W_64^2 = cos(π/16)
+            0.956_940_35_f32,                  // W_64^3 = cos(3π/32)
+            0.923_879_5_f32,                   // W_64^4 = cos(π/8)
+            0.881_921_3_f32,                   // W_64^5 = cos(5π/32)
+            0.831_469_6_f32,                   // W_64^6 = cos(3π/16)
+            0.773_010_43_f32,                  // W_64^7 = cos(7π/32)
+            std::f32::consts::FRAC_1_SQRT_2,   // W_64^8 = sqrt(2)/2
+            0.634_393_3_f32,                   // W_64^9
+            0.555_570_24_f32,                  // W_64^10
+            0.471_396_74_f32,                  // W_64^11
+            0.382_683_43_f32,                  // W_64^12
+            0.290_284_66_f32,                  // W_64^13
+            0.195_090_32_f32,                  // W_64^14
+            0.098_017_14_f32,                  // W_64^15
+        ],
+    );
+
+    let twiddle_im_0_15 = f32x16::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_64^0
+            -0.098_017_14_f32,                 // W_64^1 = -sin(π/32)
+            -0.195_090_32_f32,                 // W_64^2 = -sin(π/16)
+            -0.290_284_66_f32,                 // W_64^3 = -sin(3π/32)
+            -0.382_683_43_f32,                 // W_64^4 = -sin(π/8)
+            -0.471_396_74_f32,                 // W_64^5 = -sin(5π/32)
+            -0.555_570_24_f32,                 // W_64^6 = -sin(3π/16)
+            -0.634_393_3_f32,                  // W_64^7 = -sin(7π/32)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_64^8
+            -0.773_010_43_f32,                 // W_64^9
+            -0.831_469_6_f32,                  // W_64^10
+            -0.881_921_3_f32,                  // W_64^11
+            -0.923_879_5_f32,                  // W_64^12
+            -0.956_940_35_f32,                 // W_64^13
+            -0.980_785_25_f32,                 // W_64^14
+            -0.995_184_7_f32,                  // W_64^15
+        ],
+    );
 
     // Twiddles for k = 16..31
-    let twiddle_re_16_31 = f32x16::new([
-        0.0_f32,                           // W_64^16 = -i
-        -0.098_017_14_f32,                 // W_64^17
-        -0.195_090_32_f32,                 // W_64^18
-        -0.290_284_66_f32,                 // W_64^19
-        -0.382_683_43_f32,                 // W_64^20
-        -0.471_396_74_f32,                 // W_64^21
-        -0.555_570_24_f32,                 // W_64^22
-        -0.634_393_3_f32,                  // W_64^23
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_64^24
-        -0.773_010_43_f32,                 // W_64^25
-        -0.831_469_6_f32,                  // W_64^26
-        -0.881_921_3_f32,                  // W_64^27
-        -0.923_879_5_f32,                  // W_64^28
-        -0.956_940_35_f32,                 // W_64^29
-        -0.980_785_25_f32,                 // W_64^30
-        -0.995_184_7_f32,                  // W_64^31
-    ]);
-
-    let twiddle_im_16_31 = f32x16::new([
-        -1.0_f32,                          // W_64^16
-        -0.995_184_7_f32,                  // W_64^17
-        -0.980_785_25_f32,                 // W_64^18
-        -0.956_940_35_f32,                 // W_64^19
-        -0.923_879_5_f32,                  // W_64^20
-        -0.881_921_3_f32,                  // W_64^21
-        -0.831_469_6_f32,                  // W_64^22
-        -0.773_010_43_f32,                 // W_64^23
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_64^24
-        -0.634_393_3_f32,                  // W_64^25
-        -0.555_570_24_f32,                 // W_64^26
-        -0.471_396_74_f32,                 // W_64^27
-        -0.382_683_43_f32,                 // W_64^28
-        -0.290_284_66_f32,                 // W_64^29
-        -0.195_090_32_f32,                 // W_64^30
-        -0.098_017_14_f32,                 // W_64^31
-    ]);
+    let twiddle_re_16_31 = f32x16::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_64^16 = -i
+            -0.098_017_14_f32,                 // W_64^17
+            -0.195_090_32_f32,                 // W_64^18
+            -0.290_284_66_f32,                 // W_64^19
+            -0.382_683_43_f32,                 // W_64^20
+            -0.471_396_74_f32,                 // W_64^21
+            -0.555_570_24_f32,                 // W_64^22
+            -0.634_393_3_f32,                  // W_64^23
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_64^24
+            -0.773_010_43_f32,                 // W_64^25
+            -0.831_469_6_f32,                  // W_64^26
+            -0.881_921_3_f32,                  // W_64^27
+            -0.923_879_5_f32,                  // W_64^28
+            -0.956_940_35_f32,                 // W_64^29
+            -0.980_785_25_f32,                 // W_64^30
+            -0.995_184_7_f32,                  // W_64^31
+        ],
+    );
+
+    let twiddle_im_16_31 = f32x16::simd_from(
+        simd,
+        [
+            -1.0_f32,                          // W_64^16
+            -0.995_184_7_f32,                  // W_64^17
+            -0.980_785_25_f32,                 // W_64^18
+            -0.956_940_35_f32,                 // W_64^19
+            -0.923_879_5_f32,                  // W_64^20
+            -0.881_921_3_f32,                  // W_64^21
+            -0.831_469_6_f32,                  // W_64^22
+            -0.773_010_43_f32,                 // W_64^23
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_64^24
+            -0.634_393_3_f32,                  // W_64^25
+            -0.555_570_24_f32,                 // W_64^26
+            -0.471_396_74_f32,                 // W_64^27
+            -0.382_683_43_f32,                 // W_64^28
+            -0.290_284_66_f32,                 // W_64^29
+            -0.195_090_32_f32,                 // W_64^30
+            -0.098_017_14_f32,                 // W_64^31
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -831,26 +931,26 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Process butterflies 0..15
-            let in0_re = f32x16::new(reals_s0[0..16].try_into().unwrap());
-            let in1_re = f32x16::new(reals_s1[0..16].try_into().unwrap());
-            let in0_im = f32x16::new(imags_s0[0..16].try_into().unwrap());
-            let in1_im = f32x16::new(imags_s1[0..16].try_into().unwrap());
+            let in0_re = f32x16::from_slice(simd, &reals_s0[0..16]);
+            let in1_re = f32x16::from_slice(simd, &reals_s1[0..16]);
+            let in0_im = f32x16::from_slice(simd, &imags_s0[0..16]);
+            let in1_im = f32x16::from_slice(simd, &imags_s1[0..16]);
 
             let out0_re = twiddle_im_0_15.mul_add(-in1_im, twiddle_re_0_15.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im_0_15.mul_add(in1_re, twiddle_re_0_15.mul_add(in1_im, in0_im));
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[0..16].copy_from_slice(out0_re.as_array());
-            imags_s0[0..16].copy_from_slice(out0_im.as_array());
-            reals_s1[0..16].copy_from_slice(out1_re.as_array());
-            imags_s1[0..16].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[0..16]);
+            out0_im.store_slice(&mut imags_s0[0..16]);
+            out1_re.store_slice(&mut reals_s1[0..16]);
+            out1_im.store_slice(&mut imags_s1[0..16]);
 
             // Process butterflies 16..31
-            let in0_re = f32x16::new(reals_s0[16..32].try_into().unwrap());
-            let in1_re = f32x16::new(reals_s1[16..32].try_into().unwrap());
-            let in0_im = f32x16::new(imags_s0[16..32].try_into().unwrap());
-            let in1_im = f32x16::new(imags_s1[16..32].try_into().unwrap());
+            let in0_re = f32x16::from_slice(simd, &reals_s0[16..32]);
+            let in1_re = f32x16::from_slice(simd, &reals_s1[16..32]);
+            let in0_im = f32x16::from_slice(simd, &imags_s0[16..32]);
+            let in1_im = f32x16::from_slice(simd, &imags_s1[16..32]);
 
             let out0_re =
                 twiddle_im_16_31.mul_add(-in1_im, twiddle_re_16_31.mul_add(in1_re, in0_re));
@@ -859,24 +959,33 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[16..32].copy_from_slice(out0_re.as_array());
-            imags_s0[16..32].copy_from_slice(out0_im.as_array());
-            reals_s1[16..32].copy_from_slice(out1_re.as_array());
-            imags_s1[16..32].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[16..32]);
+            out0_im.store_slice(&mut imags_s0[16..32]);
+            out1_re.store_slice(&mut reals_s1[16..32]);
+            out1_im.store_slice(&mut imags_s1[16..32]);
         });
 }
 
 /// General DIT butterfly for f64
-#[multiversion::multiversion(targets(
"x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_64_chunk_n_simd( +#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure +pub fn fft_dit_chunk_n_f64( + simd: S, + reals: &mut [f64], + imags: &mut [f64], + twiddles_re: &[f64], + twiddles_im: &[f64], + dist: usize, +) { + simd.vectorize( + #[inline(always)] + || fft_dit_chunk_n_simd_f64(simd, reals, imags, twiddles_re, twiddles_im, dist), + ) +} + +/// General DIT butterfly for f64 +#[inline(always)] // required by fearless_simd +fn fft_dit_chunk_n_simd_f64( + simd: S, reals: &mut [f64], imags: &mut [f64], twiddles_re: &[f64], @@ -901,14 +1010,14 @@ pub fn fft_dit_64_chunk_n_simd( .zip(twiddles_re.as_chunks::().0.iter()) .zip(twiddles_im.as_chunks::().0.iter()) .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { - let two = f64x8::splat(2.0); - let in0_re = f64x8::new(*re_s0); - let in1_re = f64x8::new(*re_s1); - let in0_im = f64x8::new(*im_s0); - let in1_im = f64x8::new(*im_s1); + let two = f64x8::splat(simd, 2.0); + let in0_re = f64x8::simd_from(simd, *re_s0); + let in1_re = f64x8::simd_from(simd, *re_s1); + let in0_im = f64x8::simd_from(simd, *im_s0); + let in1_im = f64x8::simd_from(simd, *im_s1); - let tw_re = f64x8::new(*tw_re); - let tw_im = f64x8::new(*tw_im); + let tw_re = f64x8::simd_from(simd, *tw_re); + let tw_im = f64x8::simd_from(simd, *tw_im); // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re)); @@ -919,25 +1028,34 @@ pub fn fft_dit_64_chunk_n_simd( let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - re_s0.copy_from_slice(out0_re.as_array()); - im_s0.copy_from_slice(out0_im.as_array()); - re_s1.copy_from_slice(out1_re.as_array()); - im_s1.copy_from_slice(out1_im.as_array()); + out0_re.store_slice(re_s0); + out0_im.store_slice(im_s0); + out1_re.store_slice(re_s1); + out1_im.store_slice(im_s1); }); }); } /// General DIT butterfly for f32 -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_32_chunk_n_simd( +#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure +pub fn fft_dit_chunk_n_f32( + simd: S, + reals: &mut [f32], + imags: &mut [f32], + twiddles_re: &[f32], + twiddles_im: &[f32], + dist: usize, +) { + simd.vectorize( + #[inline(always)] + || fft_dit_chunk_n_simd_f32(simd, reals, imags, twiddles_re, twiddles_im, dist), + ) +} + +/// General DIT butterfly for f32 +#[inline(always)] // required by fearless_simd +fn fft_dit_chunk_n_simd_f32( + simd: S, reals: &mut [f32], imags: &mut [f32], twiddles_re: &[f32], @@ -962,14 +1080,14 @@ pub fn fft_dit_32_chunk_n_simd( .zip(twiddles_re.as_chunks::().0.iter()) .zip(twiddles_im.as_chunks::().0.iter()) .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { - let two = f32x16::splat(2.0); - let in0_re = f32x16::new(*re_s0); - let in1_re = f32x16::new(*re_s1); - let in0_im = f32x16::new(*im_s0); - let in1_im = f32x16::new(*im_s1); + let two = f32x16::splat(simd, 2.0); + let in0_re = f32x16::simd_from(simd, *re_s0); + let in1_re = 
+                let in0_im = f32x16::simd_from(simd, *im_s0);
+                let in1_im = f32x16::simd_from(simd, *im_s1);
 
-                let tw_re = f32x16::new(*tw_re);
-                let tw_im = f32x16::new(*tw_im);
+                let tw_re = f32x16::simd_from(simd, *tw_re);
+                let tw_im = f32x16::simd_from(simd, *tw_im);
 
                 // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im
                 let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re));
@@ -980,10 +1098,10 @@
                 let out1_re = two.mul_sub(in0_re, out0_re);
                 let out1_im = two.mul_sub(in0_im, out0_im);
 
-                re_s0.copy_from_slice(out0_re.as_array());
-                im_s0.copy_from_slice(out0_im.as_array());
-                re_s1.copy_from_slice(out1_re.as_array());
-                im_s1.copy_from_slice(out1_im.as_array());
+                out0_re.store_slice(re_s0);
+                out0_im.store_slice(im_s0);
+                out1_re.store_slice(re_s1);
+                out1_im.store_slice(im_s1);
             });
    });
 }
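
Note for reviewers: every kernel in this patch uses the same butterfly identity. `out0 = in0 + w*in1` is computed with two FMAs, and `out1 = in0 - w*in1` is then recovered as `2*in0 - out0` with a single fused multiply-subtract instead of a second complex multiply. A minimal scalar reference (illustrative only; `butterfly_ref` is not part of the patch):

```rust
/// Scalar reference for the DIT butterfly the SIMD kernels above implement.
/// Complex numbers are (re, im) pairs; `w` is the twiddle factor.
fn butterfly_ref(in0: (f64, f64), in1: (f64, f64), w: (f64, f64)) -> ((f64, f64), (f64, f64)) {
    // out0 = in0 + w * in1 (complex multiply-accumulate)
    let out0_re = in0.0 + w.0 * in1.0 - w.1 * in1.1;
    let out0_im = in0.1 + w.0 * in1.1 + w.1 * in1.0;
    // out1 = in0 - w * in1, recovered as 2*in0 - out0 so the SIMD kernels can
    // reuse out0 via `two.mul_sub(in0, out0)` instead of a second complex multiply
    let out1_re = 2.0 * in0.0 - out0_re;
    let out1_im = 2.0 * in0.1 - out0_im;
    ((out0_re, out0_im), (out1_re, out1_im))
}
```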
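For context, a minimal sketch of the runtime-dispatch pattern this patch adopts, assuming the API of the fearless_simd fork pinned in Cargo.toml (as exercised above): `Level::new()` detects the best available SIMD level once, `dispatch!` monomorphizes the call for that level, and `vectorize` re-asserts the target features inside the kernel so the compiler can emit wide instructions. The `kernel` function and its body are hypothetical placeholders.

```rust
use fearless_simd::{dispatch, Level, Simd};

// `inline(never)` mirrors the kernels above: it keeps each kernel as its own
// function so register pressure in the caller stays manageable.
#[inline(never)]
fn kernel<S: Simd>(simd: S, xs: &mut [f32]) {
    simd.vectorize(
        #[inline(always)]
        || xs.iter_mut().for_each(|x| *x *= 2.0),
    )
}

fn main() {
    let mut xs = vec![1.0_f32; 8];
    let level = Level::new(); // one-time CPU feature detection
    dispatch!(level, simd => kernel(simd, &mut xs));
    assert_eq!(xs[0], 2.0);
}
```

This replaces the per-function `#[multiversion]` attribute: feature detection happens once at the FFT entry point rather than on every kernel call, and the same code path also covers non-x86 targets such as ARM NEON.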