From 3c802b944cf19e4633f0b3304dd0936a795f1f2d Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 19 Jun 2024 19:29:12 +0100 Subject: [PATCH 1/5] Add deinterleaving function --- src/lib.rs | 1 + src/utils.rs | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 src/utils.rs diff --git a/src/lib.rs b/src/lib.rs index fd8e4b9..d0d84ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ mod kernels; pub mod options; pub mod planner; mod twiddles; +mod utils; macro_rules! impl_fft_for { ($func_name:ident, $precision:ty, $planner:ty, $opts_and_plan:ident) => { diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..70ed734 --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,86 @@ +//! Utility functions such as interleave/deinterleave + +use std::simd::{prelude::Simd, simd_swizzle, SimdElement}; + +// We don't multiversion for AVX-512 here and keep the chunk size below AVX-512 +// because we haven't seen any gains from it in benchmarks. +// This might be due to us running benchmarks on Zen4 which implements AVX-512 +// on top of 256-bit wide execution units. +// +// If benchmarks on "real" AVX-512 show improvement on AVX-512 +// without degrading AVX2 machines due to larger chunk size, +// the AVX-512 specialization should be re-enabled. +#[multiversion::multiversion( + targets( + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", + ))] +/// Separates data like `[1, 2, 3, 4]` into `([1, 3], [2, 4])` for any length +pub(crate) fn deinterleave(input: &[T]) -> (Vec, Vec) { + const CHUNK_SIZE: usize = 4; + const DOUBLE_CHUNK: usize = CHUNK_SIZE * 2; + + let out_len = input.len() / 2; + // We've benchmarked, and it turns out that this approach with zeroed memory + // is faster than using uninit memory and bumping the length once in a while! + let mut out_odd = vec![T::default(); out_len]; + let mut out_even = vec![T::default(); out_len]; + + input + .chunks_exact(DOUBLE_CHUNK) + .zip(out_odd.chunks_exact_mut(CHUNK_SIZE)) + .zip(out_even.chunks_exact_mut(CHUNK_SIZE)) + .for_each(|((in_chunk, odds), evens)| { + let in_simd: Simd = Simd::from_array(in_chunk.try_into().unwrap()); + // This generates *slightly* faster code than just assigning values by index. + // You'd think simd::deinterleave would be appropriate, but it does something different! + let result = simd_swizzle!(in_simd, [0, 2, 4, 6, 1, 3, 5, 7]); + let result_arr = result.to_array(); + odds.copy_from_slice(&result_arr[..CHUNK_SIZE]); + evens.copy_from_slice(&result_arr[CHUNK_SIZE..]); + }); + + // Process the remainder, too small for the vectorized loop + let input_rem = input.chunks_exact(DOUBLE_CHUNK).remainder(); + let odds_rem = out_odd.chunks_exact_mut(CHUNK_SIZE).into_remainder(); + let evens_rem = out_even.chunks_exact_mut(CHUNK_SIZE).into_remainder(); + input_rem + .chunks_exact(2) + .zip(odds_rem.iter_mut()) + .zip(evens_rem.iter_mut()) + .for_each(|((inp, odd), even)| { + *odd = inp[0]; + *even = inp[1]; + }); + + (out_odd, out_even) +} + +#[cfg(test)] +mod tests { + use super::deinterleave; + + fn gen_test_vec(len: usize) -> Vec { + (0..len).into_iter().collect() + } + + /// Slow but obviously correct implementation of deinterleaving, + /// to be used in tests + fn deinterleave_naive(input: &[T]) -> (Vec, Vec) { + input.chunks_exact(2).map(|c| (c[0], c[1])).unzip() + } + + #[test] + fn deinterleaving_correctness() { + for len in [0, 1, 2, 3, 15, 16, 17, 127, 128, 129, 130, 135, 100500] { + let input = gen_test_vec(len); + let (naive_a, naive_b) = deinterleave_naive(&input); + let (opt_a, opt_b) = deinterleave(&input); + assert_eq!(naive_a, opt_a); + assert_eq!(naive_b, opt_b); + } + } +} From 7011dfc040b866cd773185667915a9920b9c5a80 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Thu, 20 Jun 2024 19:03:39 -0400 Subject: [PATCH 2/5] Make sure benchmark runs --- Cargo.toml | 5 +++++ src/lib.rs | 2 +- src/utils.rs | 17 +++++++++-------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 12d50ef..6a47f5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,11 +13,16 @@ exclude = ["assets", "scripts", "benches"] [dependencies] num-traits = "0.2.18" multiversion = "0.7" +criterion = "0.5.1" [dev-dependencies] utilities = { path = "utilities" } fftw = "0.8.0" +[[bench]] +name = "bench" +harness = false + [profile.release] codegen-units = 1 lto = true diff --git a/src/lib.rs b/src/lib.rs index d0d84ca..87e6275 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ mod kernels; pub mod options; pub mod planner; mod twiddles; -mod utils; +pub mod utils; macro_rules! impl_fft_for { ($func_name:ident, $precision:ty, $planner:ty, $opts_and_plan:ident) => { diff --git a/src/utils.rs b/src/utils.rs index 70ed734..9facec9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -59,18 +59,19 @@ pub(crate) fn deinterleave(input: &[T]) -> (Vec (out_odd, out_even) } +/// Slow but obviously correct implementation of deinterleaving, +/// to be used in tests +#[allow(dead_code)] +pub fn deinterleave_naive(input: &[T]) -> (Vec, Vec) { + input.chunks_exact(2).map(|c| (c[0], c[1])).unzip() +} + #[cfg(test)] mod tests { - use super::deinterleave; + use super::*; fn gen_test_vec(len: usize) -> Vec { - (0..len).into_iter().collect() - } - - /// Slow but obviously correct implementation of deinterleaving, - /// to be used in tests - fn deinterleave_naive(input: &[T]) -> (Vec, Vec) { - input.chunks_exact(2).map(|c| (c[0], c[1])).unzip() + (0..len).collect() } #[test] From b70dd4b318f59af32740c4bd4c9e75a69bcd690b Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Thu, 20 Jun 2024 19:04:36 -0400 Subject: [PATCH 3/5] Forgot to add benchmark file --- benches/bench.rs | 225 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 benches/bench.rs diff --git a/benches/bench.rs b/benches/bench.rs new file mode 100644 index 0000000..416c438 --- /dev/null +++ b/benches/bench.rs @@ -0,0 +1,225 @@ +#![feature(portable_simd, avx512_target_feature)] + +use std::simd::{simd_swizzle, Simd, SimdElement}; + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; + +use phastft::utils::deinterleave_naive; + +// fn criterion_benchmark(c: &mut Criterion) { +// let sizes = vec![1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18, 1 << 20]; +// +// let mut group = c.benchmark_group("r2c_versus_c2c"); +// for &size in &sizes { +// group.throughput(Throughput::Elements(size as u64)); +// +// group.bench_with_input(BenchmarkId::new("r2c_fft", size), &size, |b, &size| { +// let mut s_re = vec![0.0; size]; +// let mut s_im = vec![0.0; size]; +// gen_random_signal(&mut s_re, &mut s_im); +// +// b.iter(|| { +// let mut output_re = vec![0.0; size]; +// let mut output_im = vec![0.0; size]; +// r2c_fft_f64( +// black_box(&mut s_re), +// black_box(&mut output_re), +// black_box(&mut output_im), +// ); +// }); +// }); +// +// group.bench_with_input(BenchmarkId::new("c2c_fft", size), &size, |b, &size| { +// let mut s_re = vec![0.0; size]; +// let mut s_im = vec![0.0; size]; +// gen_random_signal(&mut s_re, &mut s_im); +// s_im = vec![0.0; size]; +// +// b.iter(|| { +// fft_64( +// black_box(&mut s_re), +// black_box(&mut s_im), +// Direction::Forward, +// ); +// }); +// }); +// +// group.bench_with_input(BenchmarkId::new("real_fft", size), &size, |b, &size| { +// let mut s_re = vec![0.0; size]; +// let mut s_im = vec![0.0; size]; +// gen_random_signal(&mut s_re, &mut s_im); +// let mut output = vec![Complex::default(); s_re.len() / 2 + 1]; +// +// b.iter(|| { +// let mut planner = RealFftPlanner::::new(); +// let fft = planner.plan_fft_forward(s_re.len()); +// fft.process(&mut s_re, &mut output) +// .expect("fft.process() failed!"); +// }); +// }); +// } +// group.finish(); +// } + +#[multiversion::multiversion( + targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", + ))] +fn deinterleave(input: &[T]) -> (Vec, Vec) { + const CHUNK_SIZE: usize = 4; + + let out_len = input.len() / 2; + let mut out_odd = vec![T::default(); out_len]; + let mut out_even = vec![T::default(); out_len]; + + input + .chunks_exact(CHUNK_SIZE * 2) + .zip(out_odd.chunks_exact_mut(CHUNK_SIZE)) + .zip(out_even.chunks_exact_mut(CHUNK_SIZE)) + .for_each(|((in_chunk, odds), evens)| { + odds[0] = in_chunk[0]; + evens[0] = in_chunk[1]; + odds[1] = in_chunk[2]; + evens[1] = in_chunk[3]; + odds[2] = in_chunk[4]; + evens[2] = in_chunk[5]; + odds[3] = in_chunk[6]; + evens[3] = in_chunk[7]; + }); + + (out_odd, out_even) +} + +#[multiversion::multiversion( + targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", + ))] +fn deinterleave_simd_swizzle(input: &[T]) -> (Vec, Vec) { + const CHUNK_SIZE: usize = 4; + const DOUBLE_CHUNK: usize = CHUNK_SIZE * 2; + + let out_len = input.len() / 2; + let mut out_odd = vec![T::default(); out_len]; + let mut out_even = vec![T::default(); out_len]; + + input + .chunks_exact(DOUBLE_CHUNK) + .zip(out_odd.chunks_exact_mut(CHUNK_SIZE)) + .zip(out_even.chunks_exact_mut(CHUNK_SIZE)) + .for_each(|((in_chunk, odds), evens)| { + let in_simd: Simd = Simd::from_array(in_chunk.try_into().unwrap()); + let result = simd_swizzle!(in_simd, [0, 2, 4, 6, 1, 3, 5, 7]); + let result_arr = result.to_array(); + odds.copy_from_slice(&result_arr[..CHUNK_SIZE]); + evens.copy_from_slice(&result_arr[CHUNK_SIZE..]); + }); + + (out_odd, out_even) +} + +// We don't multiversion for AVX-512 here and keep the chunk size below AVX-512 +// because we haven't seen any gains from it in benchmarks. +// This might be due to us running benchmarks on Zen4 which implements AVX-512 +// on top of 256-bit wide execution units. +// +// If benchmarks on "real" AVX-512 show improvement on AVX-512 +// without degrading AVX2 machines due to larger chunk size, +// the AVX-512 specialization should be re-enabled. +#[multiversion::multiversion( + targets( + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", + ))] +/// Separates data like `[1, 2, 3, 4]` into `([1, 3], [2, 4])` for any length +pub(crate) fn deinterleave_from_pr( + input: &[T], +) -> (Vec, Vec) { + const CHUNK_SIZE: usize = 4; + const DOUBLE_CHUNK: usize = CHUNK_SIZE * 2; + + let out_len = input.len() / 2; + // We've benchmarked, and it turns out that this approach with zeroed memory + // is faster than using uninit memory and bumping the length once in a while! + let mut out_odd = vec![T::default(); out_len]; + let mut out_even = vec![T::default(); out_len]; + + input + .chunks_exact(DOUBLE_CHUNK) + .zip(out_odd.chunks_exact_mut(CHUNK_SIZE)) + .zip(out_even.chunks_exact_mut(CHUNK_SIZE)) + .for_each(|((in_chunk, odds), evens)| { + let in_simd: Simd = Simd::from_array(in_chunk.try_into().unwrap()); + // This generates *slightly* faster code than just assigning values by index. + // You'd think simd::deinterleave would be appropriate, but it does something different! + let result = simd_swizzle!(in_simd, [0, 2, 4, 6, 1, 3, 5, 7]); + let result_arr = result.to_array(); + odds.copy_from_slice(&result_arr[..CHUNK_SIZE]); + evens.copy_from_slice(&result_arr[CHUNK_SIZE..]); + }); + + // Process the remainder, too small for the vectorized loop + let input_rem = input.chunks_exact(DOUBLE_CHUNK).remainder(); + let odds_rem = out_odd.chunks_exact_mut(CHUNK_SIZE).into_remainder(); + let evens_rem = out_even.chunks_exact_mut(CHUNK_SIZE).into_remainder(); + input_rem + .chunks_exact(2) + .zip(odds_rem.iter_mut()) + .zip(evens_rem.iter_mut()) + .for_each(|((inp, odd), even)| { + *odd = inp[0]; + *even = inp[1]; + }); + + (out_odd, out_even) +} + +fn benchmark_deinterleave(c: &mut Criterion) { + let mut group = c.benchmark_group(format!("deinterleave")); + + for s in (4..=28).step_by(4) { + let size = 1 << s; + let input: Vec = (0..size).map(|x| x as f64).collect(); + + group.bench_with_input( + BenchmarkId::new("Naive deinterleave", size), + &input, + |b, input| b.iter(|| deinterleave_naive(black_box(input))), + ); + + group.bench_with_input( + BenchmarkId::new("Autovectorized deinterleave", size), + &input, + |b, input| b.iter(|| deinterleave(black_box(input))), + ); + + group.bench_with_input( + BenchmarkId::new("Simd Swizzle deinterleave", size), + &input, + |b, input| b.iter(|| deinterleave_simd_swizzle(black_box(input))), + ); + + group.bench_with_input( + BenchmarkId::new("PR deinterleave", size), + &input, + |b, input| b.iter(|| deinterleave_simd_swizzle(black_box(input))), + ); + } + + group.finish(); +} + +criterion_group!(benches, benchmark_deinterleave); +criterion_main!(benches); From a089121b986af6e5ee8536a1be25fc8e736a2c83 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Fri, 21 Jun 2024 01:29:58 -0400 Subject: [PATCH 4/5] Revert "Make sure benchmark runs" This reverts commit 7011dfc040b866cd773185667915a9920b9c5a80. --- Cargo.toml | 5 ----- src/lib.rs | 2 +- src/utils.rs | 17 ++++++++--------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6a47f5d..12d50ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,16 +13,11 @@ exclude = ["assets", "scripts", "benches"] [dependencies] num-traits = "0.2.18" multiversion = "0.7" -criterion = "0.5.1" [dev-dependencies] utilities = { path = "utilities" } fftw = "0.8.0" -[[bench]] -name = "bench" -harness = false - [profile.release] codegen-units = 1 lto = true diff --git a/src/lib.rs b/src/lib.rs index 87e6275..d0d84ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ mod kernels; pub mod options; pub mod planner; mod twiddles; -pub mod utils; +mod utils; macro_rules! impl_fft_for { ($func_name:ident, $precision:ty, $planner:ty, $opts_and_plan:ident) => { diff --git a/src/utils.rs b/src/utils.rs index 9facec9..70ed734 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -59,19 +59,18 @@ pub(crate) fn deinterleave(input: &[T]) -> (Vec (out_odd, out_even) } -/// Slow but obviously correct implementation of deinterleaving, -/// to be used in tests -#[allow(dead_code)] -pub fn deinterleave_naive(input: &[T]) -> (Vec, Vec) { - input.chunks_exact(2).map(|c| (c[0], c[1])).unzip() -} - #[cfg(test)] mod tests { - use super::*; + use super::deinterleave; fn gen_test_vec(len: usize) -> Vec { - (0..len).collect() + (0..len).into_iter().collect() + } + + /// Slow but obviously correct implementation of deinterleaving, + /// to be used in tests + fn deinterleave_naive(input: &[T]) -> (Vec, Vec) { + input.chunks_exact(2).map(|c| (c[0], c[1])).unzip() } #[test] From 303eef5f1b05b415146df259369a209e2cf7bd2b Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Fri, 21 Jun 2024 01:45:56 -0400 Subject: [PATCH 5/5] Revert "Forgot to add benchmark file" This reverts commit b70dd4b318f59af32740c4bd4c9e75a69bcd690b. --- benches/bench.rs | 225 ----------------------------------------------- 1 file changed, 225 deletions(-) delete mode 100644 benches/bench.rs diff --git a/benches/bench.rs b/benches/bench.rs deleted file mode 100644 index 416c438..0000000 --- a/benches/bench.rs +++ /dev/null @@ -1,225 +0,0 @@ -#![feature(portable_simd, avx512_target_feature)] - -use std::simd::{simd_swizzle, Simd, SimdElement}; - -use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; - -use phastft::utils::deinterleave_naive; - -// fn criterion_benchmark(c: &mut Criterion) { -// let sizes = vec![1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18, 1 << 20]; -// -// let mut group = c.benchmark_group("r2c_versus_c2c"); -// for &size in &sizes { -// group.throughput(Throughput::Elements(size as u64)); -// -// group.bench_with_input(BenchmarkId::new("r2c_fft", size), &size, |b, &size| { -// let mut s_re = vec![0.0; size]; -// let mut s_im = vec![0.0; size]; -// gen_random_signal(&mut s_re, &mut s_im); -// -// b.iter(|| { -// let mut output_re = vec![0.0; size]; -// let mut output_im = vec![0.0; size]; -// r2c_fft_f64( -// black_box(&mut s_re), -// black_box(&mut output_re), -// black_box(&mut output_im), -// ); -// }); -// }); -// -// group.bench_with_input(BenchmarkId::new("c2c_fft", size), &size, |b, &size| { -// let mut s_re = vec![0.0; size]; -// let mut s_im = vec![0.0; size]; -// gen_random_signal(&mut s_re, &mut s_im); -// s_im = vec![0.0; size]; -// -// b.iter(|| { -// fft_64( -// black_box(&mut s_re), -// black_box(&mut s_im), -// Direction::Forward, -// ); -// }); -// }); -// -// group.bench_with_input(BenchmarkId::new("real_fft", size), &size, |b, &size| { -// let mut s_re = vec![0.0; size]; -// let mut s_im = vec![0.0; size]; -// gen_random_signal(&mut s_re, &mut s_im); -// let mut output = vec![Complex::default(); s_re.len() / 2 + 1]; -// -// b.iter(|| { -// let mut planner = RealFftPlanner::::new(); -// let fft = planner.plan_fft_forward(s_re.len()); -// fft.process(&mut s_re, &mut output) -// .expect("fft.process() failed!"); -// }); -// }); -// } -// group.finish(); -// } - -#[multiversion::multiversion( - targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 - "x86_64+avx2+fma", // x86_64-v3 - "x86_64+sse4.2", // x86_64-v2 - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", - ))] -fn deinterleave(input: &[T]) -> (Vec, Vec) { - const CHUNK_SIZE: usize = 4; - - let out_len = input.len() / 2; - let mut out_odd = vec![T::default(); out_len]; - let mut out_even = vec![T::default(); out_len]; - - input - .chunks_exact(CHUNK_SIZE * 2) - .zip(out_odd.chunks_exact_mut(CHUNK_SIZE)) - .zip(out_even.chunks_exact_mut(CHUNK_SIZE)) - .for_each(|((in_chunk, odds), evens)| { - odds[0] = in_chunk[0]; - evens[0] = in_chunk[1]; - odds[1] = in_chunk[2]; - evens[1] = in_chunk[3]; - odds[2] = in_chunk[4]; - evens[2] = in_chunk[5]; - odds[3] = in_chunk[6]; - evens[3] = in_chunk[7]; - }); - - (out_odd, out_even) -} - -#[multiversion::multiversion( - targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 - "x86_64+avx2+fma", // x86_64-v3 - "x86_64+sse4.2", // x86_64-v2 - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", - ))] -fn deinterleave_simd_swizzle(input: &[T]) -> (Vec, Vec) { - const CHUNK_SIZE: usize = 4; - const DOUBLE_CHUNK: usize = CHUNK_SIZE * 2; - - let out_len = input.len() / 2; - let mut out_odd = vec![T::default(); out_len]; - let mut out_even = vec![T::default(); out_len]; - - input - .chunks_exact(DOUBLE_CHUNK) - .zip(out_odd.chunks_exact_mut(CHUNK_SIZE)) - .zip(out_even.chunks_exact_mut(CHUNK_SIZE)) - .for_each(|((in_chunk, odds), evens)| { - let in_simd: Simd = Simd::from_array(in_chunk.try_into().unwrap()); - let result = simd_swizzle!(in_simd, [0, 2, 4, 6, 1, 3, 5, 7]); - let result_arr = result.to_array(); - odds.copy_from_slice(&result_arr[..CHUNK_SIZE]); - evens.copy_from_slice(&result_arr[CHUNK_SIZE..]); - }); - - (out_odd, out_even) -} - -// We don't multiversion for AVX-512 here and keep the chunk size below AVX-512 -// because we haven't seen any gains from it in benchmarks. -// This might be due to us running benchmarks on Zen4 which implements AVX-512 -// on top of 256-bit wide execution units. -// -// If benchmarks on "real" AVX-512 show improvement on AVX-512 -// without degrading AVX2 machines due to larger chunk size, -// the AVX-512 specialization should be re-enabled. -#[multiversion::multiversion( - targets( - "x86_64+avx2+fma", // x86_64-v3 - "x86_64+sse4.2", // x86_64-v2 - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", - ))] -/// Separates data like `[1, 2, 3, 4]` into `([1, 3], [2, 4])` for any length -pub(crate) fn deinterleave_from_pr( - input: &[T], -) -> (Vec, Vec) { - const CHUNK_SIZE: usize = 4; - const DOUBLE_CHUNK: usize = CHUNK_SIZE * 2; - - let out_len = input.len() / 2; - // We've benchmarked, and it turns out that this approach with zeroed memory - // is faster than using uninit memory and bumping the length once in a while! - let mut out_odd = vec![T::default(); out_len]; - let mut out_even = vec![T::default(); out_len]; - - input - .chunks_exact(DOUBLE_CHUNK) - .zip(out_odd.chunks_exact_mut(CHUNK_SIZE)) - .zip(out_even.chunks_exact_mut(CHUNK_SIZE)) - .for_each(|((in_chunk, odds), evens)| { - let in_simd: Simd = Simd::from_array(in_chunk.try_into().unwrap()); - // This generates *slightly* faster code than just assigning values by index. - // You'd think simd::deinterleave would be appropriate, but it does something different! - let result = simd_swizzle!(in_simd, [0, 2, 4, 6, 1, 3, 5, 7]); - let result_arr = result.to_array(); - odds.copy_from_slice(&result_arr[..CHUNK_SIZE]); - evens.copy_from_slice(&result_arr[CHUNK_SIZE..]); - }); - - // Process the remainder, too small for the vectorized loop - let input_rem = input.chunks_exact(DOUBLE_CHUNK).remainder(); - let odds_rem = out_odd.chunks_exact_mut(CHUNK_SIZE).into_remainder(); - let evens_rem = out_even.chunks_exact_mut(CHUNK_SIZE).into_remainder(); - input_rem - .chunks_exact(2) - .zip(odds_rem.iter_mut()) - .zip(evens_rem.iter_mut()) - .for_each(|((inp, odd), even)| { - *odd = inp[0]; - *even = inp[1]; - }); - - (out_odd, out_even) -} - -fn benchmark_deinterleave(c: &mut Criterion) { - let mut group = c.benchmark_group(format!("deinterleave")); - - for s in (4..=28).step_by(4) { - let size = 1 << s; - let input: Vec = (0..size).map(|x| x as f64).collect(); - - group.bench_with_input( - BenchmarkId::new("Naive deinterleave", size), - &input, - |b, input| b.iter(|| deinterleave_naive(black_box(input))), - ); - - group.bench_with_input( - BenchmarkId::new("Autovectorized deinterleave", size), - &input, - |b, input| b.iter(|| deinterleave(black_box(input))), - ); - - group.bench_with_input( - BenchmarkId::new("Simd Swizzle deinterleave", size), - &input, - |b, input| b.iter(|| deinterleave_simd_swizzle(black_box(input))), - ); - - group.bench_with_input( - BenchmarkId::new("PR deinterleave", size), - &input, - |b, input| b.iter(|| deinterleave_simd_swizzle(black_box(input))), - ); - } - - group.finish(); -} - -criterion_group!(benches, benchmark_deinterleave); -criterion_main!(benches);