From 21d77dbf52883bcdf48d474f4856e8640bb8d6f4 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Tue, 30 Apr 2024 12:05:30 -0400 Subject: [PATCH 01/23] Add a function to separate AoS to SoA --- Cargo.toml | 1 + src/lib.rs | 30 +++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 98e51eb..4b5c7a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ exclude = ["assets", "scripts", "benches"] [dependencies] num-traits = "0.2.18" multiversion = "0.7" +num-complex = "0.4.5" [dev-dependencies] utilities = { path = "utilities" } diff --git a/src/lib.rs b/src/lib.rs index c65a74f..44b78d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,9 @@ #![forbid(unsafe_code)] #![feature(portable_simd, avx512_target_feature)] +use num_complex::Complex; +use num_traits::Float; + use crate::cobra::cobra_apply; use crate::kernels::{ fft_32_chunk_n_simd, fft_64_chunk_n_simd, fft_chunk_2, fft_chunk_4, fft_chunk_n, @@ -156,8 +159,8 @@ mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::num_complex::Complex; use utilities::rustfft::FftPlanner; + use utilities::rustfft::num_complex::Complex; use super::*; @@ -257,3 +260,28 @@ mod tests { test_fft_correctness!(fft_correctness_32, f32, fft_32, 4, 9); test_fft_correctness!(fft_correctness_64, f64, fft_64, 4, 17); } + +/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) +/// into separate vectors for the corresponding real and imaginary components. +pub fn separate_re_im(signal: &[Complex], chunk_size: usize) -> (Vec, Vec) { + let mut reals = Vec::with_capacity(signal.len()); + let mut imaginaries = Vec::with_capacity(signal.len()); + let iter = signal.chunks_exact(chunk_size); + let rem = iter.remainder(); + + // We don't assume power of 2 size, so we don't use `chunks_exact`. + // Process each chunk, including the last chunk which may be smaller. + for chunk in iter { + for num in chunk { + reals.push(num.re); + imaginaries.push(num.im); + } + } + + for num in rem { + reals.push(num.re); + imaginaries.push(num.im); + } + + (reals, imaginaries) +} From 58374f1f2ce659d17ac6f3e07346cdcd8c8feec0 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Tue, 30 Apr 2024 12:08:18 -0400 Subject: [PATCH 02/23] Fix formatting --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 44b78d8..2821ab7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -159,8 +159,8 @@ mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::FftPlanner; use utilities::rustfft::num_complex::Complex; + use utilities::rustfft::FftPlanner; use super::*; From 59de096ae48829d5ec55b03688528c8f042ec425 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 1 May 2024 13:40:19 -0400 Subject: [PATCH 03/23] Add reverse separate fn and add test --- src/lib.rs | 84 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2821ab7..bd8aea1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -154,16 +154,73 @@ impl_fft_with_opts_and_plan_for!( 16 ); +/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) +/// into separate vectors for the corresponding real and imaginary components. +pub fn separate_re_im(signal: &[Complex], chunk_size: usize) -> (Vec, Vec) { + let mut reals = Vec::with_capacity(signal.len()); + let mut imaginaries = Vec::with_capacity(signal.len()); + let iter = signal.chunks_exact(chunk_size); + let rem = iter.remainder(); + + // We don't assume power of 2 size, so we don't use `chunks_exact`. + // Process each chunk, including the last chunk which may be smaller. + for chunk in iter { + for num in chunk { + reals.push(num.re); + imaginaries.push(num.im); + } + } + + for num in rem { + reals.push(num.re); + imaginaries.push(num.im); + } + + (reals, imaginaries) +} + +/// Utility function to combine separate vectors of real and imaginary components +/// into a single vector of Complex Number Structs. +/// +/// # Panics +/// +/// Panics if `reals.len() != imags.len()`. +pub fn combine_re_im(reals: &[T], imags: &[T]) -> Vec> { + assert_eq!(reals.len(), imags.len()); + + reals + .iter() + .zip(imags.iter()) + .map(|(z_re, z_im)| Complex::new(*z_re, *z_im)) + .collect() +} + #[cfg(test)] mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::num_complex::Complex; use utilities::rustfft::FftPlanner; + use utilities::rustfft::num_complex::Complex; use super::*; + #[test] + fn test_separate_and_combine_re_im() { + let complex_vec = vec![ + Complex::new(1.0, 2.0), + Complex::new(3.0, 4.0), + Complex::new(5.0, 6.0), + Complex::new(7.0, 8.0), + ]; + + let (reals, imags) = separate_re_im(&complex_vec, 2); + + let recombined_vec = combine_re_im(&reals, &imags); + + assert_eq!(complex_vec, recombined_vec); + } + macro_rules! non_power_of_2_planner { ($test_name:ident, $planner:ty) => { #[should_panic] @@ -260,28 +317,3 @@ mod tests { test_fft_correctness!(fft_correctness_32, f32, fft_32, 4, 9); test_fft_correctness!(fft_correctness_64, f64, fft_64, 4, 17); } - -/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) -/// into separate vectors for the corresponding real and imaginary components. -pub fn separate_re_im(signal: &[Complex], chunk_size: usize) -> (Vec, Vec) { - let mut reals = Vec::with_capacity(signal.len()); - let mut imaginaries = Vec::with_capacity(signal.len()); - let iter = signal.chunks_exact(chunk_size); - let rem = iter.remainder(); - - // We don't assume power of 2 size, so we don't use `chunks_exact`. - // Process each chunk, including the last chunk which may be smaller. - for chunk in iter { - for num in chunk { - reals.push(num.re); - imaginaries.push(num.im); - } - } - - for num in rem { - reals.push(num.re); - imaginaries.push(num.im); - } - - (reals, imaginaries) -} From a716cd7010031cef88bb188e0ec77065e8779f63 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 1 May 2024 14:05:28 -0400 Subject: [PATCH 04/23] Add fft_*_interleaved impls and tests --- src/lib.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index bd8aea1..a83ea38 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,26 @@ macro_rules! impl_fft_for { impl_fft_for!(fft_64, f64, Planner64, fft_64_with_opts_and_plan); impl_fft_for!(fft_32, f32, Planner32, fft_32_with_opts_and_plan); +macro_rules! impl_fft_interleaved_for { + ($func_name:ident, $precision:ty, $fft_func:ident) => { + /// FFT -- Decimation in Frequency. This is just the decimation-in-time algorithm, reversed. + /// This call to FFT is run, in-place. + /// The input should be provided in normal order, and then the modified input is bit-reversed. + /// + /// + /// ## References + /// + pub fn $func_name(signal: &mut [Complex<$precision>], direction: Direction) { + let (mut reals, mut imags) = separate_re_im(signal, 2); + $fft_func(&mut reals, &mut imags, direction); + signal.copy_from_slice(&combine_re_im(&reals, &imags)) + } + }; +} + +impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32); +impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64); + macro_rules! impl_fft_with_opts_and_plan_for { ($func_name:ident, $precision:ty, $planner:ty, $simd_butterfly_kernel:ident, $lanes:literal) => { /// Same as [fft], but also accepts [`Options`] that control optimization strategies, as well as @@ -316,4 +336,43 @@ mod tests { test_fft_correctness!(fft_correctness_32, f32, fft_32, 4, 9); test_fft_correctness!(fft_correctness_64, f64, fft_64, 4, 17); + + #[test] + fn fft_interleaved_correctness() { + let n = 4; + let big_n = 1 << n; + let mut actual_signal: Vec<_> = (1..=big_n).map(|i| Complex::new(i as f64, 0.0)).collect(); + let mut expected_reals: Vec<_> = (1..=big_n).map(|i| i as f64).collect(); + let mut expected_imags = vec![0.0; big_n]; + + fft_64_interleaved(&mut actual_signal, Direction::Forward); + fft_64(&mut expected_reals, &mut expected_imags, Direction::Forward); + + actual_signal + .iter() + .zip(expected_reals) + .zip(expected_imags) + .for_each(|((z, z_re), z_im)| { + assert_float_closeness(z.re, z_re, 1e-10); + assert_float_closeness(z.im, z_im, 1e-10); + }); + + let n = 4; + let big_n = 1 << n; + let mut actual_signal: Vec<_> = (1..=big_n).map(|i| Complex::new(i as f32, 0.0)).collect(); + let mut expected_reals: Vec<_> = (1..=big_n).map(|i| i as f32).collect(); + let mut expected_imags = vec![0.0; big_n]; + + fft_32_interleaved(&mut actual_signal, Direction::Forward); + fft_32(&mut expected_reals, &mut expected_imags, Direction::Forward); + + actual_signal + .iter() + .zip(expected_reals) + .zip(expected_imags) + .for_each(|((z, z_re), z_im)| { + assert_float_closeness(z.re, z_re, 1e-10); + assert_float_closeness(z.im, z_im, 1e-10); + }); + } } From ecca75be52a3ea057cfeb5d131d24a5f44328c25 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 1 May 2024 14:06:53 -0400 Subject: [PATCH 05/23] Fix formatting --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index a83ea38..076c73b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -220,8 +220,8 @@ mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::FftPlanner; use utilities::rustfft::num_complex::Complex; + use utilities::rustfft::FftPlanner; use super::*; From 23c1d21e1db7b1d873e4e5474ed8ead40cfc800e Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 3 May 2024 22:51:31 +0100 Subject: [PATCH 06/23] Simplify separate_re_im --- src/lib.rs | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 076c73b..8f2dbfb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -75,7 +75,7 @@ macro_rules! impl_fft_interleaved_for { /// ## References /// pub fn $func_name(signal: &mut [Complex<$precision>], direction: Direction) { - let (mut reals, mut imags) = separate_re_im(signal, 2); + let (mut reals, mut imags) = separate_re_im(signal); $fft_func(&mut reals, &mut imags, direction); signal.copy_from_slice(&combine_re_im(&reals, &imags)) } @@ -176,27 +176,8 @@ impl_fft_with_opts_and_plan_for!( /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) /// into separate vectors for the corresponding real and imaginary components. -pub fn separate_re_im(signal: &[Complex], chunk_size: usize) -> (Vec, Vec) { - let mut reals = Vec::with_capacity(signal.len()); - let mut imaginaries = Vec::with_capacity(signal.len()); - let iter = signal.chunks_exact(chunk_size); - let rem = iter.remainder(); - - // We don't assume power of 2 size, so we don't use `chunks_exact`. - // Process each chunk, including the last chunk which may be smaller. - for chunk in iter { - for num in chunk { - reals.push(num.re); - imaginaries.push(num.im); - } - } - - for num in rem { - reals.push(num.re); - imaginaries.push(num.im); - } - - (reals, imaginaries) +pub fn separate_re_im(signal: &[Complex]) -> (Vec, Vec) { + signal.iter().map(|z| (z.re, z.im)).unzip() } /// Utility function to combine separate vectors of real and imaginary components @@ -234,7 +215,7 @@ mod tests { Complex::new(7.0, 8.0), ]; - let (reals, imags) = separate_re_im(&complex_vec, 2); + let (reals, imags) = separate_re_im(&complex_vec); let recombined_vec = combine_re_im(&reals, &imags); From 13fe64c1058ac3d4997b9910c6382936837ba8d9 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 8 May 2024 12:57:20 -0400 Subject: [PATCH 07/23] Update docs for interleaved fft --- src/lib.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 8f2dbfb..df32329 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -67,10 +67,11 @@ impl_fft_for!(fft_32, f32, Planner32, fft_32_with_opts_and_plan); macro_rules! impl_fft_interleaved_for { ($func_name:ident, $precision:ty, $fft_func:ident) => { - /// FFT -- Decimation in Frequency. This is just the decimation-in-time algorithm, reversed. - /// This call to FFT is run, in-place. - /// The input should be provided in normal order, and then the modified input is bit-reversed. + /// FFT Interleaved -- this is an alternative to `fft_64`/`fft_32` in the case where + /// the input data is a array of `[num_complex::Complex]`. /// + /// The input should be provided in normal order, and then the modified input is + /// bit-reversed. /// /// ## References /// From a0d2822e3f4f0d8e4cce4fb55f0f9aef7fb63192 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 8 May 2024 13:25:14 -0400 Subject: [PATCH 08/23] Transition `num-complex` dependency to optional - Make num-complex optional and non-default, and all functions that take and test interleaved FFT are now under the same feature - Bump num-complex version 0.4.5 --> 0.4.6 - Enable num-complex for docs/docs.rs - Update github action workflow to run tests for all features - Fix formatting in docs to fix links --- .github/workflows/rust.yml | 1 + Cargo.toml | 8 +++++++- src/lib.rs | 15 ++++++++++++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 157d5aa..626f294 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -63,6 +63,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: test + args: --all-features coverage: runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index 4b5c7a7..ac74756 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,11 @@ exclude = ["assets", "scripts", "benches"] [dependencies] num-traits = "0.2.18" multiversion = "0.7" -num-complex = "0.4.5" +num-complex = { version = "0.4.6", optional = true } + +[features] +default = [] +complex-nums = ["dep:num-complex"] [dev-dependencies] utilities = { path = "utilities" } @@ -28,3 +32,5 @@ panic = "abort" inherits = "release" debug = true +[package.metadata.docs.rs] +all-features = true \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index df32329..c16b641 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,9 @@ #![forbid(unsafe_code)] #![feature(portable_simd, avx512_target_feature)] +#[cfg(feature = "complex-nums")] use num_complex::Complex; +#[cfg(feature = "complex-nums")] use num_traits::Float; use crate::cobra::cobra_apply; @@ -65,10 +67,11 @@ macro_rules! impl_fft_for { impl_fft_for!(fft_64, f64, Planner64, fft_64_with_opts_and_plan); impl_fft_for!(fft_32, f32, Planner32, fft_32_with_opts_and_plan); +#[cfg(feature = "complex-nums")] macro_rules! impl_fft_interleaved_for { ($func_name:ident, $precision:ty, $fft_func:ident) => { - /// FFT Interleaved -- this is an alternative to `fft_64`/`fft_32` in the case where - /// the input data is a array of `[num_complex::Complex]`. + /// FFT Interleaved -- this is an alternative to [`fft_64`]/[`fft_32`] in the case where + /// the input data is a array of [`num_complex::Complex`]. /// /// The input should be provided in normal order, and then the modified input is /// bit-reversed. @@ -83,7 +86,9 @@ macro_rules! impl_fft_interleaved_for { }; } +#[cfg(feature = "complex-nums")] impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32); +#[cfg(feature = "complex-nums")] impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64); macro_rules! impl_fft_with_opts_and_plan_for { @@ -177,6 +182,7 @@ impl_fft_with_opts_and_plan_for!( /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) /// into separate vectors for the corresponding real and imaginary components. +#[cfg(feature = "complex-nums")] pub fn separate_re_im(signal: &[Complex]) -> (Vec, Vec) { signal.iter().map(|z| (z.re, z.im)).unzip() } @@ -187,6 +193,7 @@ pub fn separate_re_im(signal: &[Complex]) -> (Vec, Vec) { /// # Panics /// /// Panics if `reals.len() != imags.len()`. +#[cfg(feature = "complex-nums")] pub fn combine_re_im(reals: &[T], imags: &[T]) -> Vec> { assert_eq!(reals.len(), imags.len()); @@ -202,11 +209,12 @@ mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::num_complex::Complex; use utilities::rustfft::FftPlanner; + use utilities::rustfft::num_complex::Complex; use super::*; + #[cfg(feature = "complex-nums")] #[test] fn test_separate_and_combine_re_im() { let complex_vec = vec![ @@ -319,6 +327,7 @@ mod tests { test_fft_correctness!(fft_correctness_32, f32, fft_32, 4, 9); test_fft_correctness!(fft_correctness_64, f64, fft_64, 4, 17); + #[cfg(feature = "complex-nums")] #[test] fn fft_interleaved_correctness() { let n = 4; From 05713ffbb1d92b0aea5ff36f75e922925295bb21 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 8 May 2024 13:29:34 -0400 Subject: [PATCH 09/23] Fix formatting --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index c16b641..fec23e1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -209,8 +209,8 @@ mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::FftPlanner; use utilities::rustfft::num_complex::Complex; + use utilities::rustfft::FftPlanner; use super::*; From 1377ead84098df502fcb4ce801e631fdb2f4c429 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 8 May 2024 13:41:03 -0400 Subject: [PATCH 10/23] Fix formatting --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 56ecf6c..383dbd7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -230,8 +230,8 @@ mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::FftPlanner; use utilities::rustfft::num_complex::Complex; + use utilities::rustfft::FftPlanner; use super::*; From 9699f9ee967b1c318f173eb1e39be593c5cefa09 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 22 May 2024 12:32:52 -0400 Subject: [PATCH 11/23] Vectorize deinterleaving of AoS --> SoA Use bytemuck + SIMD::deinterleave to rearrange input data from a slice of Complex values into 2 slices of f32 or f64 values --- Cargo.toml | 5 +-- src/lib.rs | 103 ++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2bc3158..e3ab005 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,11 +13,12 @@ exclude = ["assets", "scripts", "benches"] [dependencies] num-traits = "0.2.18" multiversion = "0.7" -num-complex = { version = "0.4.6", optional = true } +num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true } +bytemuck = { version = "1.16.0", optional = true } [features] default = [] -complex-nums = ["dep:num-complex"] +complex-nums = ["dep:num-complex", "dep:bytemuck"] [dev-dependencies] utilities = { path = "utilities" } diff --git a/src/lib.rs b/src/lib.rs index 383dbd7..da93cdd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,20 @@ #![doc = include_str!("../README.md")] -#![warn(clippy::complexity)] -#![warn(missing_docs)] -#![warn(clippy::style)] -#![warn(clippy::correctness)] -#![warn(clippy::suspicious)] -#![warn(clippy::perf)] +#![warn( + missing_docs, + clippy::complexity, + clippy::perf, + clippy::style, + clippy::correctness, + clippy::suspicious +)] #![forbid(unsafe_code)] #![feature(portable_simd, avx512_target_feature)] +#[cfg(feature = "complex-nums")] +use std::simd::{f32x16, f64x8}; + +#[cfg(feature = "complex-nums")] +use bytemuck::cast_slice; #[cfg(feature = "complex-nums")] use num_complex::Complex; #[cfg(feature = "complex-nums")] @@ -90,9 +97,9 @@ impl_fft_for!(fft_32, f32, Planner32, fft_32_with_opts_and_plan); #[cfg(feature = "complex-nums")] macro_rules! impl_fft_interleaved_for { - ($func_name:ident, $precision:ty, $fft_func:ident) => { + ($func_name:ident, $precision:ty, $fft_func:ident, $deinterleaving_func: ident) => { /// FFT Interleaved -- this is an alternative to [`fft_64`]/[`fft_32`] in the case where - /// the input data is a array of [`num_complex::Complex`]. + /// the input data is a array of [`Complex`]. /// /// The input should be provided in normal order, and then the modified input is /// bit-reversed. @@ -100,7 +107,7 @@ macro_rules! impl_fft_interleaved_for { /// ## References /// pub fn $func_name(signal: &mut [Complex<$precision>], direction: Direction) { - let (mut reals, mut imags) = separate_re_im(signal); + let (mut reals, mut imags) = $deinterleaving_func(signal); $fft_func(&mut reals, &mut imags, direction); signal.copy_from_slice(&combine_re_im(&reals, &imags)) } @@ -108,9 +115,9 @@ macro_rules! impl_fft_interleaved_for { } #[cfg(feature = "complex-nums")] -impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32); +impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32, separate_re_im_f32); #[cfg(feature = "complex-nums")] -impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64); +impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64, separate_re_im_f64); macro_rules! impl_fft_with_opts_and_plan_for { ($func_name:ident, $precision:ty, $planner:ty, $simd_butterfly_kernel:ident, $lanes:literal) => { @@ -201,13 +208,69 @@ impl_fft_with_opts_and_plan_for!( 16 ); -/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) -/// into separate vectors for the corresponding real and imaginary components. -#[cfg(feature = "complex-nums")] -pub fn separate_re_im(signal: &[Complex]) -> (Vec, Vec) { - signal.iter().map(|z| (z.re, z.im)).unzip() +macro_rules! impl_separate_re_im { + ($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => { + /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) + /// into separate vectors for the corresponding real and imaginary components. + #[multiversion::multiversion( + targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", + ))] + pub fn $func_name( + signal: &[Complex<$precision>], + ) -> (Vec<$precision>, Vec<$precision>) { + let n = signal.len(); + let mut reals = vec![0.0; n]; + let mut imags = vec![0.0; n]; + + let complex_f64 = cast_slice(signal); + const CHUNK_SIZE: usize = $lanes * 2; + + let mut i = 0; + for ((chunk, chunk_re), chunk_im) in complex_f64 + .chunks_exact(CHUNK_SIZE) + .zip(reals.chunks_exact_mut($lanes)) + .zip(imags.chunks_exact_mut($lanes)) + { + let (first_half, second_half) = chunk.split_at($lanes); + + let a = <$simd_vec>::from_slice(&first_half); + let b = <$simd_vec>::from_slice(&second_half); + let (re_deinterleaved, im_deinterleaved) = a.deinterleave(b); + + chunk_re.copy_from_slice(&re_deinterleaved.to_array()); + chunk_im.copy_from_slice(&im_deinterleaved.to_array()); + i += CHUNK_SIZE; + } + + let remainder = complex_f64.chunks_exact(CHUNK_SIZE).remainder(); + if !remainder.is_empty() { + remainder + .chunks_exact(2) + .zip(reals[i..].iter_mut()) + .zip(imags[i..].iter_mut()) + .for_each(|((c, re), im)| { + *re = c[0]; + *im = c[1]; + }); + } + + (reals, imags) + } + }; } +#[cfg(feature = "complex-nums")] +impl_separate_re_im!(separate_re_im_f32, f32, 16, f32x16); + +#[cfg(feature = "complex-nums")] +impl_separate_re_im!(separate_re_im_f64, f64, 8, f64x8); + /// Utility function to combine separate vectors of real and imaginary components /// into a single vector of Complex Number Structs. /// @@ -238,14 +301,14 @@ mod tests { #[cfg(feature = "complex-nums")] #[test] fn test_separate_and_combine_re_im() { - let complex_vec = vec![ + let complex_vec: Vec<_> = vec![ Complex::new(1.0, 2.0), Complex::new(3.0, 4.0), Complex::new(5.0, 6.0), Complex::new(7.0, 8.0), ]; - let (reals, imags) = separate_re_im(&complex_vec); + let (reals, imags) = separate_re_im_f64(&complex_vec); let recombined_vec = combine_re_im(&reals, &imags); @@ -351,7 +414,7 @@ mod tests { #[cfg(feature = "complex-nums")] #[test] fn fft_interleaved_correctness() { - let n = 4; + let n = 10; let big_n = 1 << n; let mut actual_signal: Vec<_> = (1..=big_n).map(|i| Complex::new(i as f64, 0.0)).collect(); let mut expected_reals: Vec<_> = (1..=big_n).map(|i| i as f64).collect(); @@ -369,7 +432,7 @@ mod tests { assert_float_closeness(z.im, z_im, 1e-10); }); - let n = 4; + let n = 10; let big_n = 1 << n; let mut actual_signal: Vec<_> = (1..=big_n).map(|i| Complex::new(i as f32, 0.0)).collect(); let mut expected_reals: Vec<_> = (1..=big_n).map(|i| i as f32).collect(); From 4d431903ba0b78c7ce19a2fc36a587f09da8a59d Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 22 May 2024 12:37:50 -0400 Subject: [PATCH 12/23] Put macro definition begind feature flag --- src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index da93cdd..6bdd81e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -208,6 +208,7 @@ impl_fft_with_opts_and_plan_for!( 16 ); +#[cfg(feature = "complex-nums")] macro_rules! impl_separate_re_im { ($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => { /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) @@ -293,8 +294,8 @@ mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::num_complex::Complex; use utilities::rustfft::FftPlanner; + use utilities::rustfft::num_complex::Complex; use super::*; From 356e468cfd79bdd7420e2fcdb97c4b32f16245f9 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 22 May 2024 12:40:01 -0400 Subject: [PATCH 13/23] Fix formatting --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 6bdd81e..f1b804a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -294,8 +294,8 @@ mod tests { use std::ops::Range; use utilities::assert_float_closeness; - use utilities::rustfft::FftPlanner; use utilities::rustfft::num_complex::Complex; + use utilities::rustfft::FftPlanner; use super::*; From fbb1bad41f66617123399cc8fe49ce44e6398948 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 22 May 2024 13:09:49 -0400 Subject: [PATCH 14/23] Remove index tracker from hot path --- src/lib.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f1b804a..77ba60c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -229,11 +229,10 @@ macro_rules! impl_separate_re_im { let mut reals = vec![0.0; n]; let mut imags = vec![0.0; n]; - let complex_f64 = cast_slice(signal); + let complex_t: &[$precision] = cast_slice(signal); const CHUNK_SIZE: usize = $lanes * 2; - let mut i = 0; - for ((chunk, chunk_re), chunk_im) in complex_f64 + for ((chunk, chunk_re), chunk_im) in complex_t .chunks_exact(CHUNK_SIZE) .zip(reals.chunks_exact_mut($lanes)) .zip(imags.chunks_exact_mut($lanes)) @@ -246,11 +245,11 @@ macro_rules! impl_separate_re_im { chunk_re.copy_from_slice(&re_deinterleaved.to_array()); chunk_im.copy_from_slice(&im_deinterleaved.to_array()); - i += CHUNK_SIZE; } - let remainder = complex_f64.chunks_exact(CHUNK_SIZE).remainder(); + let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder(); if !remainder.is_empty() { + let i = reals.len() - remainder.len() / 2; remainder .chunks_exact(2) .zip(reals[i..].iter_mut()) From 39578345755d689262c98225dd8bd4391486a347 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 22 May 2024 13:32:55 -0400 Subject: [PATCH 15/23] Add benchmark using criterion --- Cargo.toml | 5 +++ benches/bench.rs | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 benches/bench.rs diff --git a/Cargo.toml b/Cargo.toml index e3ab005..41a5d40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,11 @@ complex-nums = ["dep:num-complex", "dep:bytemuck"] [dev-dependencies] utilities = { path = "utilities" } fftw = "0.8.0" +criterion = "0.5.1" + +[[bench]] +name = "bench" +harness = false [profile.release] codegen-units = 1 diff --git a/benches/bench.rs b/benches/bench.rs new file mode 100644 index 0000000..c0d2a6e --- /dev/null +++ b/benches/bench.rs @@ -0,0 +1,82 @@ +#![feature(portable_simd, avx512_target_feature)] + +use std::simd::{f32x16, f64x8}; + +use bytemuck::cast_slice; +use criterion::{black_box, Criterion, criterion_group, criterion_main}; +use num_complex::Complex; + +use phastft::separate_re_im_f64; + +macro_rules! impl_separate_re_im_with_capacity { + ($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => { + /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) + /// into separate vectors for the corresponding real and imaginary components. + #[multiversion::multiversion( + targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", + ))] + pub fn $func_name( + signal: &[Complex<$precision>], + ) -> (Vec<$precision>, Vec<$precision>) { + let n = signal.len(); + let mut reals = Vec::with_capacity(n); + let mut imags = Vec::with_capacity(n); + + let complex_t: &[$precision] = cast_slice(signal); + const CHUNK_SIZE: usize = $lanes * 2; + + for chunk in complex_t + .chunks_exact(CHUNK_SIZE) + { + let (first_half, second_half) = chunk.split_at($lanes); + + let a = <$simd_vec>::from_slice(&first_half); + let b = <$simd_vec>::from_slice(&second_half); + let (re_deinterleaved, im_deinterleaved) = a.deinterleave(b); + + reals.extend_from_slice(&re_deinterleaved.to_array()); + imags.extend_from_slice(&im_deinterleaved.to_array()); + } + + let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder(); + if !remainder.is_empty() { + let i = n - remainder.len() / 2; + for (j, chunk) in remainder.chunks_exact(2).enumerate() { + reals.push(chunk[0]); + imags.push(chunk[1]); + } + } + + (reals, imags) + } + }; +} + +#[cfg(feature = "complex-nums")] +impl_separate_re_im_with_capacity!(separate_re_im_f32_with_cap, f32, 16, f32x16); + +#[cfg(feature = "complex-nums")] +impl_separate_re_im_with_capacity!(separate_re_im_f64_with_cap, f64, 8, f64x8); + +fn criterion_benchmark(c: &mut Criterion) { + let signal: Vec> = (0..(1 << 20)) + .map(|x| Complex::new(x as f64, (x * 2) as f64)) + .collect(); + + c.bench_function("separate_re_im_with_zeroed_vecs", |b| { + b.iter(|| separate_re_im_f64(black_box(&signal))); + }); + + c.bench_function("separate_re_im_with_capacity", |b| { + b.iter(|| separate_re_im_f64_with_cap(black_box(&signal))); + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From b429a32f2f6c5a51837c2c02b96542d665c4fb6a Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 22 May 2024 13:33:34 -0400 Subject: [PATCH 16/23] Fix formatting --- benches/bench.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/bench.rs b/benches/bench.rs index c0d2a6e..4812cb5 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -3,7 +3,7 @@ use std::simd::{f32x16, f64x8}; use bytemuck::cast_slice; -use criterion::{black_box, Criterion, criterion_group, criterion_main}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; use num_complex::Complex; use phastft::separate_re_im_f64; From b8affb2cf0ec401d0ba0a90aabd558a253b433a1 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 22 May 2024 16:19:11 -0400 Subject: [PATCH 17/23] Account for different signal sizes --- benches/bench.rs | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 4812cb5..7e3b12f 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -3,7 +3,7 @@ use std::simd::{f32x16, f64x8}; use bytemuck::cast_slice; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use num_complex::Complex; use phastft::separate_re_im_f64; @@ -44,14 +44,13 @@ macro_rules! impl_separate_re_im_with_capacity { imags.extend_from_slice(&im_deinterleaved.to_array()); } - let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder(); - if !remainder.is_empty() { - let i = n - remainder.len() / 2; - for (j, chunk) in remainder.chunks_exact(2).enumerate() { - reals.push(chunk[0]); - imags.push(chunk[1]); - } - } + let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder(); + if !remainder.is_empty() { + for chunk in remainder.chunks_exact(2) { + reals.push(chunk[0]); + imags.push(chunk[1]); + } + } (reals, imags) } @@ -65,17 +64,27 @@ impl_separate_re_im_with_capacity!(separate_re_im_f32_with_cap, f32, 16, f32x16) impl_separate_re_im_with_capacity!(separate_re_im_f64_with_cap, f64, 8, f64x8); fn criterion_benchmark(c: &mut Criterion) { - let signal: Vec> = (0..(1 << 20)) - .map(|x| Complex::new(x as f64, (x * 2) as f64)) - .collect(); + let sizes = vec![1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18, 1 << 20]; - c.bench_function("separate_re_im_with_zeroed_vecs", |b| { - b.iter(|| separate_re_im_f64(black_box(&signal))); - }); + let mut group = c.benchmark_group("separate_re_im"); + for size in sizes.into_iter() { + let signal: Vec<_> = (0..size) + .map(|x| Complex::new(x as f64, (x * 2) as f64)) + .collect(); - c.bench_function("separate_re_im_with_capacity", |b| { - b.iter(|| separate_re_im_f64_with_cap(black_box(&signal))); - }); + group.throughput(Throughput::Elements(size)); + + group.bench_with_input( + BenchmarkId::new("with_zeroed_vecs", size), + &signal, + |b, s| b.iter(|| separate_re_im_f64(black_box(s))), + ); + + group.bench_with_input(BenchmarkId::new("with_capacity", size), &signal, |b, s| { + b.iter(|| separate_re_im_f64_with_cap(black_box(s))); + }); + } + group.finish(); } criterion_group!(benches, criterion_benchmark); From 98503e94c7cd0bacfe2c9180df1a486e1bb9922a Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Sun, 14 Jul 2024 14:15:01 -0400 Subject: [PATCH 18/23] Add new python benchmarking "framework" --- benches/bench.rs | 38 ++++--- benches/run_benches.py | 228 +++++++++++++++++++++++++++++++++++++++++ src/kernels.rs | 16 +-- src/lib.rs | 2 +- 4 files changed, 263 insertions(+), 21 deletions(-) create mode 100644 benches/run_benches.py diff --git a/benches/bench.rs b/benches/bench.rs index 9bf511e..8120787 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -13,8 +13,7 @@ use utilities::rustfft::num_complex::Complex; use utilities::rustfft::FftPlanner; const LENGTHS: &[usize] = &[ - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, -]; + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; fn generate_numbers(n: usize) -> (Vec, Vec) where @@ -112,22 +111,37 @@ fn benchmark_inverse_f32(c: &mut Criterion) { } fn benchmark_forward_f64(c: &mut Criterion) { - let options = Options::default(); + let mut group = c.benchmark_group("Forward f64"); for n in LENGTHS.iter() { let len = 1 << n; - let id = format!("FFT Forward f64 {} elements", len); + let id = "PhastFT FFT Forward"; + let options = Options::guess_options(len); let planner = Planner64::new(len, Direction::Forward); + let (mut reals, mut imags) = generate_numbers(len); + group.throughput(Throughput::Elements(len as u64)); - c.bench_function(&id, |b| { - let (mut reals, mut imags) = generate_numbers(len); + group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| { b.iter(|| { - black_box(fft_64_with_opts_and_plan( - &mut reals, &mut imags, &options, &planner, - )); + fft_64_with_opts_and_plan( + black_box(&mut reals), + black_box(&mut imags), + black_box(&options), + black_box(&planner), + ); }); }); + + let id = "RustFFT FFT Forward"; + let mut planner = FftPlanner::::new(); + let fft = planner.plan_fft_forward(len); + let mut signal = generate_complex_numbers(len); + + group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| { + b.iter(|| fft.process(black_box(&mut signal))); + }); } + group.finish(); } fn benchmark_inverse_f64(c: &mut Criterion) { @@ -151,9 +165,9 @@ fn benchmark_inverse_f64(c: &mut Criterion) { criterion_group!( benches, - benchmark_forward_f32, - benchmark_inverse_f32, + // benchmark_forward_f32, + // benchmark_inverse_f32, benchmark_forward_f64, - benchmark_inverse_f64 + // benchmark_inverse_f64 ); criterion_main!(benches); diff --git a/benches/run_benches.py b/benches/run_benches.py new file mode 100644 index 0000000..7c72b19 --- /dev/null +++ b/benches/run_benches.py @@ -0,0 +1,228 @@ +import os +import subprocess +import sys +from pathlib import Path +import matplotlib.pyplot as plt +import shutil +from datetime import datetime +import logging +import numpy as np + +# Configuration +OUTPUT_DIR = "benchmark_output" +HISTORY_DIR = "benchmark_history" +LOG_DIR = "benchmark_logs" +MAX_ITERS = 1 << 10 +START = 6 +END = 20 +STD_THRESHOLD = 0.05 # 5% standard deviation threshold + +# Ensure log directory exists +Path(LOG_DIR).mkdir(parents=True, exist_ok=True) + +# Setup logging +logging.basicConfig( + filename=Path(LOG_DIR) / "benchmark.log", + level=logging.INFO, + format="%(asctime)s - %(message)s", +) +console = logging.StreamHandler() +console.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s - %(message)s") +console.setFormatter(formatter) +logging.getLogger().addHandler(console) + + +def run_command(command, cwd=None): + result = subprocess.run( + command, shell=True, text=True, capture_output=True, cwd=cwd + ) + if result.returncode != 0: + logging.error(f"Error running command: {command}\n{result.stderr}") + sys.exit(result.returncode) + return result.stdout.strip() + + +def clean_build_rust(): + logging.info("Cleaning and building Rust project...") + run_command("cargo clean") + run_command("cargo build --release --examples") + + +def benchmark_with_stabilization(executable_name, n, max_iters, std_threshold): + times = [] + for i in range(max_iters): + result = run_command(f"../target/release/examples/{executable_name} {n}") + times.append(int(result)) + if len(times) > 10: # Start evaluating after a minimum number of runs + current_std = np.std(times) / np.mean(times) + if current_std < std_threshold: + break + return times + + +def benchmark( + benchmark_name, output_subdir, start, end, max_iters, std_threshold, executable_name +): + output_dir_path = Path(OUTPUT_DIR) / output_subdir + output_dir_path.mkdir(parents=True, exist_ok=True) + + for n in range(start, end + 1): + logging.info( + f"Running {benchmark_name} benchmark for N = 2^{n} with a standard deviation threshold of {std_threshold * 100}%..." + ) + times = benchmark_with_stabilization( + executable_name, n, max_iters, std_threshold + ) + output_file = output_dir_path / f"size_{n}" + with open(output_file, "w") as f: + for time in times: + f.write(f"{time}\n") + logging.info( + f"Completed N = 2^{n} in {len(times)} iterations with a final standard deviation of {np.std(times) / np.mean(times):.2%}" + ) + + +def read_benchmark_results(output_dir, start, end): + sizes = [] + times = [] + + for n in range(start, end + 1): + size_file = Path(output_dir) / f"size_{n}" + if size_file.exists(): + with open(size_file, "r") as f: + data = f.readlines() + data = [int(line.strip()) for line in data] + if data: + min_time_ns = min(data) + sizes.append(2**n) + times.append(min_time_ns) + else: + logging.warning(f"No data found in file: {size_file}") + else: + logging.warning(f"File does not exist: {size_file}") + + return sizes, times + + +def plot_benchmark_results(output_subdirs, start, end, history_dirs=[]): + plt.figure(figsize=(10, 6)) + has_data = False + + # Plot current results + for subdir in output_subdirs: + sizes, times = read_benchmark_results(Path(OUTPUT_DIR) / subdir, start, end) + if sizes and times: + has_data = True + plt.plot(sizes, times, marker="o", label=f"current {subdir}") + + # Plot previous results from history for PhastFT + for history_dir in history_dirs: + sizes, times = read_benchmark_results( + Path(history_dir) / "benchmark_output" / "phastft", start, end + ) + if sizes and times: + has_data = True + timestamp = Path(history_dir).stem + plt.plot( + sizes, times, marker="x", linestyle="--", label=f"{timestamp} phastft" + ) + + if has_data: + plt.title("Benchmark Results") + plt.xlabel("FFT Size (N)") + plt.ylabel("Minimum Time (ns)") + plt.xscale("log") + plt.yscale("log") + plt.grid(True, which="both", ls="--") + plt.legend() + plt.savefig(f"{OUTPUT_DIR}/benchmark_results.png", dpi=600) + # plt.show() + else: + logging.warning("No data available to plot.") + + +def compare_results(current_dir, previous_dir, start, end): + changes = {} + for n in range(start, end + 1): + current_file = Path(current_dir) / f"size_{n}" + previous_file = ( + Path(previous_dir) / "benchmark_output" / "phastft" / f"size_{n}" + ) + + if current_file.exists() and previous_file.exists(): + with open(current_file, "r") as cf, open(previous_file, "r") as pf: + current_data = [int(line.strip()) for line in cf.readlines()] + previous_data = [int(line.strip()) for line in pf.readlines()] + + if current_data and previous_data: + current_min = min(current_data) + previous_min = min(previous_data) + + if current_min != previous_min: + change = ((current_min - previous_min) / previous_min) * 100 + changes[n] = change + else: + logging.warning( + f"Data missing in files for size 2^{n}: Current data length: {len(current_data)}, Previous data length: {len(previous_data)}" + ) + else: + logging.warning( + f"Missing files for size 2^{n}: Current file exists: {current_file.exists()}, Previous file exists: {previous_file.exists()}" + ) + + return changes + + +def archive_current_results(): + if Path(OUTPUT_DIR).exists(): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + history_dir = Path(HISTORY_DIR) / timestamp + history_dir.mkdir(parents=True, exist_ok=True) + shutil.move(OUTPUT_DIR, history_dir) + logging.info(f"Archived current results to: {history_dir}") + else: + logging.warning( + f"Output directory '{OUTPUT_DIR}' does not exist and cannot be archived." + ) + + +def main(): + clean_build_rust() + + # Check if there are previous results for comparison + history_dirs = ( + sorted(Path(HISTORY_DIR).iterdir(), key=os.path.getmtime) + if Path(HISTORY_DIR).exists() + else [] + ) + latest_previous_dir = history_dirs[-1] if history_dirs else None + + # Run new benchmarks for PhastFT, RustFFT, and FFTW3 + benchmark("PhastFT", "phastft", START, END, MAX_ITERS, STD_THRESHOLD, "benchmark") + benchmark("RustFFT", "rustfft", START, END, MAX_ITERS, STD_THRESHOLD, "rustfft") + benchmark( + "FFTW3 Rust bindings", "fftwrb", START, END, MAX_ITERS, STD_THRESHOLD, "fftwrb" + ) + + # Compare new PhastFT benchmarks against previous results + if latest_previous_dir: + logging.info(f"Comparing with previous results from: {latest_previous_dir}") + changes = compare_results( + Path(OUTPUT_DIR) / "phastft", latest_previous_dir, START, END + ) + for n, change in changes.items(): + status = "improvement" if change < 0 else "regression" + logging.info(f"N = 2^{n}: {abs(change):.2f}% {status}") + else: + logging.info("No previous results found for comparison.") + + # Plot benchmark results + plot_benchmark_results(["phastft", "rustfft", "fftwrb"], START, END, history_dirs) + + # Archive current results + archive_current_results() + + +if __name__ == "__main__": + main() diff --git a/src/kernels.rs b/src/kernels.rs index 8651ee9..6d5e3aa 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -119,15 +119,15 @@ pub(crate) fn fft_chunk_n( ))] #[inline] pub(crate) fn fft_chunk_4(reals: &mut [T], imags: &mut [T]) { - let dist = 2; - let chunk_size = dist << 1; + const DIST: usize = 2; + const CHUNK_SIZE: usize = DIST << 1; reals - .chunks_exact_mut(chunk_size) - .zip(imags.chunks_exact_mut(chunk_size)) + .array_chunks_mut::() + .zip(imags.array_chunks_mut::()) .for_each(|(reals_chunk, imags_chunk)| { - let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist); - let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist); + let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); + let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); let real_c0 = reals_s0[0]; let real_c1 = reals_s1[0]; @@ -163,8 +163,8 @@ pub(crate) fn fft_chunk_4(reals: &mut [T], imags: &mut [T]) { #[inline] pub(crate) fn fft_chunk_2(reals: &mut [T], imags: &mut [T]) { reals - .chunks_exact_mut(2) - .zip(imags.chunks_exact_mut(2)) + .array_chunks_mut::<2>() + .zip(imags.array_chunks_mut::<2>()) .for_each(|(reals_chunk, imags_chunk)| { let z0_re = reals_chunk[0]; let z0_im = imags_chunk[0]; diff --git a/src/lib.rs b/src/lib.rs index c0e20f7..e1b6def 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,7 @@ clippy::suspicious )] #![forbid(unsafe_code)] -#![feature(portable_simd, avx512_target_feature)] +#![feature(array_chunks, portable_simd, avx512_target_feature)] #[cfg(feature = "complex-nums")] use std::simd::{f32x16, f64x8}; From a201e2c1151d24885eeffbc5b2dc550cbd9f6193 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Sun, 14 Jul 2024 14:17:26 -0400 Subject: [PATCH 19/23] Update todo --- src/twiddles.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/twiddles.rs b/src/twiddles.rs index 7e11ba9..6008ad8 100644 --- a/src/twiddles.rs +++ b/src/twiddles.rs @@ -217,7 +217,7 @@ mod tests { use super::*; - // TODO(saveliy): use + // TODO(saveliy): try to use only real twiddle factors since sin is just a phase shift of cos #[test] fn twiddles_cos_only() { let n = 4; @@ -234,7 +234,7 @@ mod tests { assert!(fwd_twiddles_re.len() == dist && fwd_twiddles_im.len() == dist); for i in 0..dist { - let w_re = fwd_twiddles_re[i]; + let _w_re = fwd_twiddles_re[i]; let expected_w_im = fwd_twiddles_im[i]; let actual_w_im = -fwd_twiddles_re[(i + dist / 2) % dist]; From 403684b60b97c9163b0c93251da930944a0874a9 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Sun, 14 Jul 2024 14:41:15 -0400 Subject: [PATCH 20/23] Make examples output time elapsed in nanoseconds --- benches/bench.rs | 3 ++- examples/benchmark.rs | 2 +- examples/fftwrb.rs | 2 +- examples/rustfft.rs | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 8120787..305399d 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -13,7 +13,8 @@ use utilities::rustfft::num_complex::Complex; use utilities::rustfft::FftPlanner; const LENGTHS: &[usize] = &[ - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]; + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +]; fn generate_numbers(n: usize) -> (Vec, Vec) where diff --git a/examples/benchmark.rs b/examples/benchmark.rs index e6ce897..0fe62bf 100644 --- a/examples/benchmark.rs +++ b/examples/benchmark.rs @@ -18,7 +18,7 @@ fn benchmark_fft_64(n: usize) { let now = std::time::Instant::now(); fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner); - let elapsed = now.elapsed().as_micros(); + let elapsed = now.elapsed().as_nanos(); println!("{elapsed}"); } diff --git a/examples/fftwrb.rs b/examples/fftwrb.rs index db4beee..3b1bf5a 100644 --- a/examples/fftwrb.rs +++ b/examples/fftwrb.rs @@ -34,7 +34,7 @@ fn benchmark_fftw(n: usize) { &mut nums, ) .unwrap(); - let elapsed = now.elapsed().as_micros(); + let elapsed = now.elapsed().as_nanos(); println!("{elapsed}"); } diff --git a/examples/rustfft.rs b/examples/rustfft.rs index 8e9ac51..6f40f80 100644 --- a/examples/rustfft.rs +++ b/examples/rustfft.rs @@ -28,7 +28,7 @@ fn benchmark_rustfft(n: usize) { let now = std::time::Instant::now(); fft.process(&mut signal); - let elapsed = now.elapsed().as_micros(); + let elapsed = now.elapsed().as_nanos(); println!("{elapsed}"); } From 1343268bd53bd0fb29872591c636dfc8442b5ae0 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Sun, 14 Jul 2024 14:45:19 -0400 Subject: [PATCH 21/23] Remove `array_chunks` --- src/kernels.rs | 8 ++++---- src/lib.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels.rs b/src/kernels.rs index 6d5e3aa..a1fe09c 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -123,8 +123,8 @@ pub(crate) fn fft_chunk_4(reals: &mut [T], imags: &mut [T]) { const CHUNK_SIZE: usize = DIST << 1; reals - .array_chunks_mut::() - .zip(imags.array_chunks_mut::()) + .chunks_exact_mut(CHUNK_SIZE) + .zip(imags.chunks_exact_mut(CHUNK_SIZE)) .for_each(|(reals_chunk, imags_chunk)| { let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST); @@ -163,8 +163,8 @@ pub(crate) fn fft_chunk_4(reals: &mut [T], imags: &mut [T]) { #[inline] pub(crate) fn fft_chunk_2(reals: &mut [T], imags: &mut [T]) { reals - .array_chunks_mut::<2>() - .zip(imags.array_chunks_mut::<2>()) + .chunks_exact_mut(2) + .zip(imags.chunks_exact_mut(2)) .for_each(|(reals_chunk, imags_chunk)| { let z0_re = reals_chunk[0]; let z0_im = imags_chunk[0]; diff --git a/src/lib.rs b/src/lib.rs index e1b6def..c0e20f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,7 @@ clippy::suspicious )] #![forbid(unsafe_code)] -#![feature(array_chunks, portable_simd, avx512_target_feature)] +#![feature(portable_simd, avx512_target_feature)] #[cfg(feature = "complex-nums")] use std::simd::{f32x16, f64x8}; From e94f2fc328d0f65a113e5c70b7fe17269795bc8e Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 17 Jul 2024 00:23:46 -0400 Subject: [PATCH 22/23] Use new de-interleaving function --- src/lib.rs | 84 ++-------------------------------------------------- src/utils.rs | 17 +++++++++++ 2 files changed, 20 insertions(+), 81 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c0e20f7..c71dfb8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -230,86 +230,6 @@ impl_fft_with_opts_and_plan_for!( 16 ); -#[cfg(feature = "complex-nums")] -macro_rules! impl_separate_re_im { - ($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => { - /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs) - /// into separate vectors for the corresponding real and imaginary components. - #[multiversion::multiversion( - targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 - "x86_64+avx2+fma", // x86_64-v3 - "x86_64+sse4.2", // x86_64-v2 - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", - ))] - pub fn $func_name( - signal: &[Complex<$precision>], - ) -> (Vec<$precision>, Vec<$precision>) { - let n = signal.len(); - let mut reals = vec![0.0; n]; - let mut imags = vec![0.0; n]; - - let complex_t: &[$precision] = cast_slice(signal); - const CHUNK_SIZE: usize = $lanes * 2; - - for ((chunk, chunk_re), chunk_im) in complex_t - .chunks_exact(CHUNK_SIZE) - .zip(reals.chunks_exact_mut($lanes)) - .zip(imags.chunks_exact_mut($lanes)) - { - let (first_half, second_half) = chunk.split_at($lanes); - - let a = <$simd_vec>::from_slice(&first_half); - let b = <$simd_vec>::from_slice(&second_half); - let (re_deinterleaved, im_deinterleaved) = a.deinterleave(b); - - chunk_re.copy_from_slice(&re_deinterleaved.to_array()); - chunk_im.copy_from_slice(&im_deinterleaved.to_array()); - } - - let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder(); - if !remainder.is_empty() { - let i = reals.len() - remainder.len() / 2; - remainder - .chunks_exact(2) - .zip(reals[i..].iter_mut()) - .zip(imags[i..].iter_mut()) - .for_each(|((c, re), im)| { - *re = c[0]; - *im = c[1]; - }); - } - - (reals, imags) - } - }; -} - -#[cfg(feature = "complex-nums")] -impl_separate_re_im!(separate_re_im_f32, f32, 16, f32x16); - -#[cfg(feature = "complex-nums")] -impl_separate_re_im!(separate_re_im_f64, f64, 8, f64x8); - -/// Utility function to combine separate vectors of real and imaginary components -/// into a single vector of Complex Number Structs. -/// -/// # Panics -/// -/// Panics if `reals.len() != imags.len()`. -#[cfg(feature = "complex-nums")] -pub fn combine_re_im(reals: &[T], imags: &[T]) -> Vec> { - assert_eq!(reals.len(), imags.len()); - - reals - .iter() - .zip(imags.iter()) - .map(|(z_re, z_im)| Complex::new(*z_re, *z_im)) - .collect() -} - #[cfg(test)] mod tests { use std::ops::Range; @@ -323,6 +243,8 @@ mod tests { #[cfg(feature = "complex-nums")] #[test] fn test_separate_and_combine_re_im() { + use utils::deinterleave; + let complex_vec: Vec<_> = vec![ Complex::new(1.0, 2.0), Complex::new(3.0, 4.0), @@ -330,7 +252,7 @@ mod tests { Complex::new(7.0, 8.0), ]; - let (reals, imags) = separate_re_im_f64(&complex_vec); + let (reals, imags) = deinterleave(&complex_vec); let recombined_vec = combine_re_im(&reals, &imags); diff --git a/src/utils.rs b/src/utils.rs index 70ed734..d4be2fe 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -59,6 +59,23 @@ pub(crate) fn deinterleave(input: &[T]) -> (Vec (out_odd, out_even) } +/// Utility function to combine separate vectors of real and imaginary components +/// into a single vector of Complex Number Structs. +/// +/// # Panics +/// +/// Panics if `reals.len() != imags.len()`. +#[cfg(feature = "complex-nums")] +pub(crate) fn combine_re_im(reals: &[T], imags: &[T]) -> Vec> { + assert_eq!(reals.len(), imags.len()); + + reals + .iter() + .zip(imags.iter()) + .map(|(z_re, z_im)| Complex::new(*z_re, *z_im)) + .collect() +} + #[cfg(test)] mod tests { use super::deinterleave; From 713b3a66cf36e33a0450877629d5305ca3434365 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Wed, 17 Jul 2024 01:16:15 -0400 Subject: [PATCH 23/23] Update pre-commit hook to run clippy everywhere --- benches/README.md | 26 +++++------------------ benches/bench.rs | 32 ++++++++++++++++------------ hooks/pre-commit | 4 ++-- src/lib.rs | 38 ++++++--------------------------- src/twiddles.rs | 2 +- src/utils.rs | 54 +++++++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 86 insertions(+), 70 deletions(-) diff --git a/benches/README.md b/benches/README.md index 3e84f65..e276e06 100644 --- a/benches/README.md +++ b/benches/README.md @@ -4,24 +4,15 @@ ### Setup Environment -1. Install [FFTW3](http://www.fftw.org/download.html)[^1] +1. Clone the `PhastFT` git repository [^2]. - It may be possible to install `fftw3` using a package manager. - - ##### debian - ```bash - sudo apt install libfftw3-dev - ``` - -2. Clone the `PhastFT` git repository [^2]. - -3. Create virtual env +2. Create virtual env ```bash cd ~/PhastFT/benches && python3 -m venv .env && source .env/bin/activate ``` -4. Install python dependencies[^1] +3. Install python dependencies[^1] ```bash pip install -r requirements.txt @@ -29,10 +20,10 @@ cd ~/PhastFT/pyphastft pip install . ``` -5. Run the `FFTW3` vs. `RustFFT` vs. `PhastFT` benchmark for all inputs of size `2^n`, where `n \in [4, 30].` +5. Run the `FFTW3-RB` vs. `RustFFT` vs. `PhastFT` benchmarks` ```bash -./benchmark.sh 4 29 +python run_benches.py ``` 6. Plot the results @@ -101,13 +92,6 @@ On linux, open access to performance monitoring, and observability operations fo echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid ``` -Add debug to `Cargo.toml` under `profile.release`: - -```bash -[profile.release] -debug = true -``` - Finally, run: ```bash diff --git a/benches/bench.rs b/benches/bench.rs index 305399d..00d7833 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -69,7 +69,7 @@ fn benchmark_forward_f32(c: &mut Criterion) { let planner = Planner32::new(len, Direction::Forward); let (mut reals, mut imags) = generate_numbers(len); - group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| { + group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &_len| { b.iter(|| { fft_32_with_opts_and_plan( black_box(&mut reals), @@ -85,7 +85,7 @@ fn benchmark_forward_f32(c: &mut Criterion) { let fft = planner.plan_fft_forward(len); let mut signal = generate_complex_numbers(len); - group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| { + group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &_len| { b.iter(|| fft.process(black_box(&mut signal))); }); } @@ -103,9 +103,12 @@ fn benchmark_inverse_f32(c: &mut Criterion) { c.bench_function(&id, |b| { let (mut reals, mut imags) = generate_numbers(len); b.iter(|| { - black_box(fft_32_with_opts_and_plan( - &mut reals, &mut imags, &options, &planner, - )); + fft_32_with_opts_and_plan( + black_box(&mut reals), + black_box(&mut imags), + black_box(&options), + black_box(&planner), + ); }); }); } @@ -122,7 +125,7 @@ fn benchmark_forward_f64(c: &mut Criterion) { let (mut reals, mut imags) = generate_numbers(len); group.throughput(Throughput::Elements(len as u64)); - group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| { + group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &_len| { b.iter(|| { fft_64_with_opts_and_plan( black_box(&mut reals), @@ -138,7 +141,7 @@ fn benchmark_forward_f64(c: &mut Criterion) { let fft = planner.plan_fft_forward(len); let mut signal = generate_complex_numbers(len); - group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| { + group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &_len| { b.iter(|| fft.process(black_box(&mut signal))); }); } @@ -156,9 +159,12 @@ fn benchmark_inverse_f64(c: &mut Criterion) { c.bench_function(&id, |b| { let (mut reals, mut imags) = generate_numbers(len); b.iter(|| { - black_box(fft_64_with_opts_and_plan( - &mut reals, &mut imags, &options, &planner, - )); + fft_64_with_opts_and_plan( + black_box(&mut reals), + black_box(&mut imags), + black_box(&options), + black_box(&planner), + ); }); }); } @@ -166,9 +172,9 @@ fn benchmark_inverse_f64(c: &mut Criterion) { criterion_group!( benches, - // benchmark_forward_f32, - // benchmark_inverse_f32, + benchmark_forward_f32, + benchmark_inverse_f32, benchmark_forward_f64, - // benchmark_inverse_f64 + benchmark_inverse_f64 ); criterion_main!(benches); diff --git a/hooks/pre-commit b/hooks/pre-commit index 6d3f529..8f289a5 100755 --- a/hooks/pre-commit +++ b/hooks/pre-commit @@ -9,13 +9,13 @@ then exit 1 fi -if ! cargo clippy -- -D warnings +if ! cargo clippy --all-targets --all-features --tests -- -D warnings then echo "There are some clippy issues." exit 1 fi -if ! cargo test +if ! cargo test --all-features then echo "There are some test issues." exit 1 diff --git a/src/lib.rs b/src/lib.rs index c71dfb8..8f03b6d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,14 +11,9 @@ #![feature(portable_simd, avx512_target_feature)] #[cfg(feature = "complex-nums")] -use std::simd::{f32x16, f64x8}; - -#[cfg(feature = "complex-nums")] -use bytemuck::cast_slice; +use crate::utils::{combine_re_im, deinterleave_complex32, deinterleave_complex64}; #[cfg(feature = "complex-nums")] use num_complex::Complex; -#[cfg(feature = "complex-nums")] -use num_traits::Float; use crate::cobra::cobra_apply; use crate::kernels::{ @@ -96,9 +91,9 @@ macro_rules! impl_fft_interleaved_for { } #[cfg(feature = "complex-nums")] -impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32, separate_re_im_f32); +impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32, deinterleave_complex32); #[cfg(feature = "complex-nums")] -impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64, separate_re_im_f64); +impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64, deinterleave_complex64); macro_rules! impl_fft_with_opts_and_plan_for { ($func_name:ident, $precision:ty, $planner:ty, $simd_butterfly_kernel:ident, $lanes:literal) => { @@ -240,25 +235,6 @@ mod tests { use super::*; - #[cfg(feature = "complex-nums")] - #[test] - fn test_separate_and_combine_re_im() { - use utils::deinterleave; - - let complex_vec: Vec<_> = vec![ - Complex::new(1.0, 2.0), - Complex::new(3.0, 4.0), - Complex::new(5.0, 6.0), - Complex::new(7.0, 8.0), - ]; - - let (reals, imags) = deinterleave(&complex_vec); - - let recombined_vec = combine_re_im(&reals, &imags); - - assert_eq!(complex_vec, recombined_vec); - } - macro_rules! non_power_of_2_planner { ($test_name:ident, $planner:ty) => { #[should_panic] @@ -436,9 +412,9 @@ mod tests { let mut re = reals.clone(); let mut im = imags.clone(); - let mut planner = Planner64::new(num_points, Direction::Forward); + let planner = Planner64::new(num_points, Direction::Forward); let opts = Options::guess_options(reals.len()); - fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &mut planner); + fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner); fft_64(&mut re, &mut im, Direction::Forward); @@ -467,9 +443,9 @@ mod tests { let mut re = reals.clone(); let mut im = imags.clone(); - let mut planner = Planner32::new(num_points, direction); + let planner = Planner32::new(num_points, direction); let opts = Options::guess_options(reals.len()); - fft_32_with_opts_and_plan(&mut reals, &mut imags, &opts, &mut planner); + fft_32_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner); fft_32(&mut re, &mut im, direction); diff --git a/src/twiddles.rs b/src/twiddles.rs index 6008ad8..144e9fc 100644 --- a/src/twiddles.rs +++ b/src/twiddles.rs @@ -239,7 +239,7 @@ mod tests { let actual_w_im = -fwd_twiddles_re[(i + dist / 2) % dist]; //assert_float_closeness(actual_w_im, expected_w_im, 1e-6); - print!("actual: {actual_w_im} expected: {expected_w_im}\n"); + println!("actual: {actual_w_im} expected: {expected_w_im}"); } println!("{:?}", fwd_twiddles_re); println!("{:?}", fwd_twiddles_im); diff --git a/src/utils.rs b/src/utils.rs index d4be2fe..3c0f240 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,5 +1,14 @@ //! Utility functions such as interleave/deinterleave +#[cfg(feature = "complex-nums")] +use num_complex::Complex; + +#[cfg(feature = "complex-nums")] +use num_traits::Float; + +#[cfg(feature = "complex-nums")] +use bytemuck::cast_slice; + use std::simd::{prelude::Simd, simd_swizzle, SimdElement}; // We don't multiversion for AVX-512 here and keep the chunk size below AVX-512 @@ -59,6 +68,30 @@ pub(crate) fn deinterleave(input: &[T]) -> (Vec (out_odd, out_even) } +/// Utility function to separate a slice of [`Complex64``] +/// into a single vector of Complex Number Structs. +/// +/// # Panics +/// +/// Panics if `reals.len() != imags.len()`. +#[cfg(feature = "complex-nums")] +pub(crate) fn deinterleave_complex64(signal: &[Complex]) -> (Vec, Vec) { + let complex_t: &[f64] = cast_slice(signal); + deinterleave(complex_t) +} + +/// Utility function to separate a slice of [`Complex32``] +/// into a single vector of Complex Number Structs. +/// +/// # Panics +/// +/// Panics if `reals.len() != imags.len()`. +#[cfg(feature = "complex-nums")] +pub(crate) fn deinterleave_complex32(signal: &[Complex]) -> (Vec, Vec) { + let complex_t: &[f32] = cast_slice(signal); + deinterleave(complex_t) +} + /// Utility function to combine separate vectors of real and imaginary components /// into a single vector of Complex Number Structs. /// @@ -78,10 +111,10 @@ pub(crate) fn combine_re_im(reals: &[T], imags: &[T]) -> Vec Vec { - (0..len).into_iter().collect() + (0..len).collect() } /// Slow but obviously correct implementation of deinterleaving, @@ -100,4 +133,21 @@ mod tests { assert_eq!(naive_b, opt_b); } } + + #[cfg(feature = "complex-nums")] + #[test] + fn test_separate_and_combine_re_im() { + let complex_vec: Vec<_> = vec![ + Complex::new(1.0, 2.0), + Complex::new(3.0, 4.0), + Complex::new(5.0, 6.0), + Complex::new(7.0, 8.0), + ]; + + let (reals, imags) = deinterleave_complex64(&complex_vec); + + let recombined_vec = combine_re_im(&reals, &imags); + + assert_eq!(complex_vec, recombined_vec); + } }