diff --git a/src/lib.rs b/src/lib.rs
index fd8e4b9..d0d84ca 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,6 +21,7 @@ mod kernels;
 pub mod options;
 pub mod planner;
 mod twiddles;
+mod utils;
 
 macro_rules! impl_fft_for {
     ($func_name:ident, $precision:ty, $planner:ty, $opts_and_plan:ident) => {
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..70ed734
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,86 @@
+//! Utility functions such as interleave/deinterleave
+
+use std::simd::{prelude::Simd, simd_swizzle, SimdElement};
+
+// We don't multiversion for AVX-512 here and keep the chunk size below AVX-512
+// because we haven't seen any gains from it in benchmarks.
+// This might be due to us running benchmarks on Zen4, which implements AVX-512
+// on top of 256-bit wide execution units.
+//
+// If benchmarks on "real" AVX-512 show improvement on AVX-512
+// without degrading AVX2 machines due to the larger chunk size,
+// the AVX-512 specialization should be re-enabled.
+#[multiversion::multiversion(
+    targets(
+    "x86_64+avx2+fma", // x86_64-v3
+    "x86_64+sse4.2", // x86_64-v2
+    "x86+avx2+fma",
+    "x86+sse4.2",
+    "x86+sse2",
+    ))]
+/// Separates data like `[1, 2, 3, 4]` into `([1, 3], [2, 4])` for any length
+pub(crate) fn deinterleave<T: Copy + Default + SimdElement>(input: &[T]) -> (Vec<T>, Vec<T>) {
+    const CHUNK_SIZE: usize = 4;
+    const DOUBLE_CHUNK: usize = CHUNK_SIZE * 2;
+
+    let out_len = input.len() / 2;
+    // We've benchmarked, and it turns out that this approach with zeroed memory
+    // is faster than using uninit memory and bumping the length once in a while!
+    let mut out_odd = vec![T::default(); out_len];
+    let mut out_even = vec![T::default(); out_len];
+
+    input
+        .chunks_exact(DOUBLE_CHUNK)
+        .zip(out_odd.chunks_exact_mut(CHUNK_SIZE))
+        .zip(out_even.chunks_exact_mut(CHUNK_SIZE))
+        .for_each(|((in_chunk, odds), evens)| {
+            let in_simd: Simd<T, DOUBLE_CHUNK> = Simd::from_array(in_chunk.try_into().unwrap());
+            // This generates *slightly* faster code than just assigning values by index.
+            // You'd think simd::deinterleave would be appropriate, but it does something different!
+            let result = simd_swizzle!(in_simd, [0, 2, 4, 6, 1, 3, 5, 7]);
+            let result_arr = result.to_array();
+            odds.copy_from_slice(&result_arr[..CHUNK_SIZE]);
+            evens.copy_from_slice(&result_arr[CHUNK_SIZE..]);
+        });
+
+    // Process the remainder, too small for the vectorized loop
+    let input_rem = input.chunks_exact(DOUBLE_CHUNK).remainder();
+    let odds_rem = out_odd.chunks_exact_mut(CHUNK_SIZE).into_remainder();
+    let evens_rem = out_even.chunks_exact_mut(CHUNK_SIZE).into_remainder();
+    input_rem
+        .chunks_exact(2)
+        .zip(odds_rem.iter_mut())
+        .zip(evens_rem.iter_mut())
+        .for_each(|((inp, odd), even)| {
+            *odd = inp[0];
+            *even = inp[1];
+        });
+
+    (out_odd, out_even)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::deinterleave;
+
+    fn gen_test_vec(len: usize) -> Vec<usize> {
+        (0..len).into_iter().collect()
+    }
+
+    /// Slow but obviously correct implementation of deinterleaving,
+    /// to be used in tests
+    fn deinterleave_naive<T: Copy>(input: &[T]) -> (Vec<T>, Vec<T>) {
+        input.chunks_exact(2).map(|c| (c[0], c[1])).unzip()
+    }
+
+    #[test]
+    fn deinterleaving_correctness() {
+        for len in [0, 1, 2, 3, 15, 16, 17, 127, 128, 129, 130, 135, 100500] {
+            let input = gen_test_vec(len);
+            let (naive_a, naive_b) = deinterleave_naive(&input);
+            let (opt_a, opt_b) = deinterleave(&input);
+            assert_eq!(naive_a, opt_a);
+            assert_eq!(naive_b, opt_b);
+        }
+    }
+}
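
Note on the swizzle index pattern: the standalone sketch below is not part of the patch; it is a hypothetical demo (names `chunk`, `grouped`, and the `f64` element type are illustrative, and it assumes a nightly toolchain with `portable_simd`, which `use std::simd` in this crate already implies). It shows what `simd_swizzle!(in_simd, [0, 2, 4, 6, 1, 3, 5, 7])` produces for one DOUBLE_CHUNK-sized block, and why the first and second halves of the result feed `out_odd` and `out_even`.

    // Hypothetical standalone demo, not part of src/utils.rs.
    #![feature(portable_simd)]
    use std::simd::{simd_swizzle, Simd};

    fn main() {
        // One 8-element block of interleaved data (e.g. alternating re/im values).
        let chunk: Simd<f64, 8> = Simd::from_array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        // Gather the even lane indices first, then the odd lane indices.
        let grouped = simd_swizzle!(chunk, [0, 2, 4, 6, 1, 3, 5, 7]);
        assert_eq!(grouped.to_array(), [1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0]);
        // deinterleave() copies the first half into `out_odd` and the second half
        // into `out_even`, matching its `[1, 2, 3, 4]` -> `([1, 3], [2, 4])` doc comment.
    }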