From 21d77dbf52883bcdf48d474f4856e8640bb8d6f4 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Tue, 30 Apr 2024 12:05:30 -0400
Subject: [PATCH 01/23] Add a function to separate AoS to SoA

---
 Cargo.toml |  1 +
 src/lib.rs | 30 +++++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/Cargo.toml b/Cargo.toml
index 98e51eb..4b5c7a7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,7 @@ exclude = ["assets", "scripts", "benches"]
 [dependencies]
 num-traits = "0.2.18"
 multiversion = "0.7"
+num-complex = "0.4.5"
 
 [dev-dependencies]
 utilities = { path = "utilities" }
diff --git a/src/lib.rs b/src/lib.rs
index c65a74f..44b78d8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,6 +8,9 @@
 #![forbid(unsafe_code)]
 #![feature(portable_simd, avx512_target_feature)]
 
+use num_complex::Complex;
+use num_traits::Float;
+
 use crate::cobra::cobra_apply;
 use crate::kernels::{
     fft_32_chunk_n_simd, fft_64_chunk_n_simd, fft_chunk_2, fft_chunk_4, fft_chunk_n,
@@ -156,8 +159,8 @@ mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::num_complex::Complex;
     use utilities::rustfft::FftPlanner;
+    use utilities::rustfft::num_complex::Complex;
 
     use super::*;
 
@@ -257,3 +260,28 @@ mod tests {
     test_fft_correctness!(fft_correctness_32, f32, fft_32, 4, 9);
     test_fft_correctness!(fft_correctness_64, f64, fft_64, 4, 17);
 }
+
+/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
+/// into separate vectors for the corresponding real and imaginary components.
+pub fn separate_re_im<T: Float>(signal: &[Complex<T>], chunk_size: usize) -> (Vec<T>, Vec<T>) {
+    let mut reals = Vec::with_capacity(signal.len());
+    let mut imaginaries = Vec::with_capacity(signal.len());
+    let iter = signal.chunks_exact(chunk_size);
+    let rem = iter.remainder();
+
+    // We don't assume power of 2 size, so we don't use `chunks_exact`.
+    // Process each chunk, including the last chunk which may be smaller.
+    for chunk in iter {
+        for num in chunk {
+            reals.push(num.re);
+            imaginaries.push(num.im);
+        }
+    }
+
+    for num in rem {
+        reals.push(num.re);
+        imaginaries.push(num.im);
+    }
+
+    (reals, imaginaries)
+}

From 58374f1f2ce659d17ac6f3e07346cdcd8c8feec0 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Tue, 30 Apr 2024 12:08:18 -0400
Subject: [PATCH 02/23] Fix formatting

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index 44b78d8..2821ab7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -159,8 +159,8 @@ mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::FftPlanner;
     use utilities::rustfft::num_complex::Complex;
+    use utilities::rustfft::FftPlanner;
 
     use super::*;
 

From 59de096ae48829d5ec55b03688528c8f042ec425 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 1 May 2024 13:40:19 -0400
Subject: [PATCH 03/23] Add reverse separate fn and add test

---
 src/lib.rs | 84 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 26 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 2821ab7..bd8aea1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -154,16 +154,73 @@ impl_fft_with_opts_and_plan_for!(
     16
 );
 
+/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
+/// into separate vectors for the corresponding real and imaginary components.
+pub fn separate_re_im<T: Float>(signal: &[Complex<T>], chunk_size: usize) -> (Vec<T>, Vec<T>) {
+    let mut reals = Vec::with_capacity(signal.len());
+    let mut imaginaries = Vec::with_capacity(signal.len());
+    let iter = signal.chunks_exact(chunk_size);
+    let rem = iter.remainder();
+
+    // We don't assume power of 2 size, so we don't use `chunks_exact`.
+    // Process each chunk, including the last chunk which may be smaller.
+    for chunk in iter {
+        for num in chunk {
+            reals.push(num.re);
+            imaginaries.push(num.im);
+        }
+    }
+
+    for num in rem {
+        reals.push(num.re);
+        imaginaries.push(num.im);
+    }
+
+    (reals, imaginaries)
+}
+
+/// Utility function to combine separate vectors of real and imaginary components
+/// into a single vector of Complex Number Structs.
+///
+/// # Panics
+///
+/// Panics if `reals.len() != imags.len()`.
+pub fn combine_re_im<T: Float>(reals: &[T], imags: &[T]) -> Vec<Complex<T>> {
+    assert_eq!(reals.len(), imags.len());
+
+    reals
+        .iter()
+        .zip(imags.iter())
+        .map(|(z_re, z_im)| Complex::new(*z_re, *z_im))
+        .collect()
+}
+
 #[cfg(test)]
 mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::num_complex::Complex;
     use utilities::rustfft::FftPlanner;
+    use utilities::rustfft::num_complex::Complex;
 
     use super::*;
 
+    #[test]
+    fn test_separate_and_combine_re_im() {
+        let complex_vec = vec![
+            Complex::new(1.0, 2.0),
+            Complex::new(3.0, 4.0),
+            Complex::new(5.0, 6.0),
+            Complex::new(7.0, 8.0),
+        ];
+
+        let (reals, imags) = separate_re_im(&complex_vec, 2);
+
+        let recombined_vec = combine_re_im(&reals, &imags);
+
+        assert_eq!(complex_vec, recombined_vec);
+    }
+
     macro_rules! non_power_of_2_planner {
         ($test_name:ident, $planner:ty) => {
             #[should_panic]
@@ -260,28 +317,3 @@ mod tests {
     test_fft_correctness!(fft_correctness_32, f32, fft_32, 4, 9);
     test_fft_correctness!(fft_correctness_64, f64, fft_64, 4, 17);
 }
-
-/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
-/// into separate vectors for the corresponding real and imaginary components.
-pub fn separate_re_im<T: Float>(signal: &[Complex<T>], chunk_size: usize) -> (Vec<T>, Vec<T>) {
-    let mut reals = Vec::with_capacity(signal.len());
-    let mut imaginaries = Vec::with_capacity(signal.len());
-    let iter = signal.chunks_exact(chunk_size);
-    let rem = iter.remainder();
-
-    // We don't assume power of 2 size, so we don't use `chunks_exact`.
-    // Process each chunk, including the last chunk which may be smaller.
-    for chunk in iter {
-        for num in chunk {
-            reals.push(num.re);
-            imaginaries.push(num.im);
-        }
-    }
-
-    for num in rem {
-        reals.push(num.re);
-        imaginaries.push(num.im);
-    }
-
-    (reals, imaginaries)
-}

From a716cd7010031cef88bb188e0ec77065e8779f63 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 1 May 2024 14:05:28 -0400
Subject: [PATCH 04/23] Add fft_*_interleaved impls and tests

---
 src/lib.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index bd8aea1..a83ea38 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -65,6 +65,26 @@ macro_rules! impl_fft_for {
 impl_fft_for!(fft_64, f64, Planner64, fft_64_with_opts_and_plan);
 impl_fft_for!(fft_32, f32, Planner32, fft_32_with_opts_and_plan);
 
+macro_rules! impl_fft_interleaved_for {
+    ($func_name:ident, $precision:ty, $fft_func:ident) => {
+        /// FFT -- Decimation in Frequency. This is just the decimation-in-time algorithm, reversed.
+        /// This call to FFT is run, in-place.
+        /// The input should be provided in normal order, and then the modified input is bit-reversed.
+        ///
+        ///
+        /// ## References
+        /// <https://inst.eecs.berkeley.edu/~ee123/sp15/Notes/Lecture08_FFT_and_SpectAnalysis.key.pdf>
+        pub fn $func_name(signal: &mut [Complex<$precision>], direction: Direction) {
+            let (mut reals, mut imags) = separate_re_im(signal, 2);
+            $fft_func(&mut reals, &mut imags, direction);
+            signal.copy_from_slice(&combine_re_im(&reals, &imags))
+        }
+    };
+}
+
+impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32);
+impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64);
+
 macro_rules! impl_fft_with_opts_and_plan_for {
     ($func_name:ident, $precision:ty, $planner:ty, $simd_butterfly_kernel:ident, $lanes:literal) => {
         /// Same as [fft], but also accepts [`Options`] that control optimization strategies, as well as
@@ -316,4 +336,43 @@ mod tests {
 
     test_fft_correctness!(fft_correctness_32, f32, fft_32, 4, 9);
     test_fft_correctness!(fft_correctness_64, f64, fft_64, 4, 17);
+
+    #[test]
+    fn fft_interleaved_correctness() {
+        let n = 4;
+        let big_n = 1 << n;
+        let mut actual_signal: Vec<_> = (1..=big_n).map(|i| Complex::new(i as f64, 0.0)).collect();
+        let mut expected_reals: Vec<_> = (1..=big_n).map(|i| i as f64).collect();
+        let mut expected_imags = vec![0.0; big_n];
+
+        fft_64_interleaved(&mut actual_signal, Direction::Forward);
+        fft_64(&mut expected_reals, &mut expected_imags, Direction::Forward);
+
+        actual_signal
+            .iter()
+            .zip(expected_reals)
+            .zip(expected_imags)
+            .for_each(|((z, z_re), z_im)| {
+                assert_float_closeness(z.re, z_re, 1e-10);
+                assert_float_closeness(z.im, z_im, 1e-10);
+            });
+
+        let n = 4;
+        let big_n = 1 << n;
+        let mut actual_signal: Vec<_> = (1..=big_n).map(|i| Complex::new(i as f32, 0.0)).collect();
+        let mut expected_reals: Vec<_> = (1..=big_n).map(|i| i as f32).collect();
+        let mut expected_imags = vec![0.0; big_n];
+
+        fft_32_interleaved(&mut actual_signal, Direction::Forward);
+        fft_32(&mut expected_reals, &mut expected_imags, Direction::Forward);
+
+        actual_signal
+            .iter()
+            .zip(expected_reals)
+            .zip(expected_imags)
+            .for_each(|((z, z_re), z_im)| {
+                assert_float_closeness(z.re, z_re, 1e-10);
+                assert_float_closeness(z.im, z_im, 1e-10);
+            });
+    }
 }

From ecca75be52a3ea057cfeb5d131d24a5f44328c25 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 1 May 2024 14:06:53 -0400
Subject: [PATCH 05/23] Fix formatting

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index a83ea38..076c73b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -220,8 +220,8 @@ mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::FftPlanner;
     use utilities::rustfft::num_complex::Complex;
+    use utilities::rustfft::FftPlanner;
 
     use super::*;
 

From 23c1d21e1db7b1d873e4e5474ed8ead40cfc800e Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Fri, 3 May 2024 22:51:31 +0100
Subject: [PATCH 06/23] Simplify separate_re_im

---
 src/lib.rs | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 076c73b..8f2dbfb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -75,7 +75,7 @@ macro_rules! impl_fft_interleaved_for {
         /// ## References
         /// <https://inst.eecs.berkeley.edu/~ee123/sp15/Notes/Lecture08_FFT_and_SpectAnalysis.key.pdf>
         pub fn $func_name(signal: &mut [Complex<$precision>], direction: Direction) {
-            let (mut reals, mut imags) = separate_re_im(signal, 2);
+            let (mut reals, mut imags) = separate_re_im(signal);
             $fft_func(&mut reals, &mut imags, direction);
             signal.copy_from_slice(&combine_re_im(&reals, &imags))
         }
@@ -176,27 +176,8 @@ impl_fft_with_opts_and_plan_for!(
 
 /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
 /// into separate vectors for the corresponding real and imaginary components.
-pub fn separate_re_im<T: Float>(signal: &[Complex<T>], chunk_size: usize) -> (Vec<T>, Vec<T>) {
-    let mut reals = Vec::with_capacity(signal.len());
-    let mut imaginaries = Vec::with_capacity(signal.len());
-    let iter = signal.chunks_exact(chunk_size);
-    let rem = iter.remainder();
-
-    // We don't assume power of 2 size, so we don't use `chunks_exact`.
-    // Process each chunk, including the last chunk which may be smaller.
-    for chunk in iter {
-        for num in chunk {
-            reals.push(num.re);
-            imaginaries.push(num.im);
-        }
-    }
-
-    for num in rem {
-        reals.push(num.re);
-        imaginaries.push(num.im);
-    }
-
-    (reals, imaginaries)
+pub fn separate_re_im<T: Float>(signal: &[Complex<T>]) -> (Vec<T>, Vec<T>) {
+    signal.iter().map(|z| (z.re, z.im)).unzip()
 }
 
 /// Utility function to combine separate vectors of real and imaginary components
@@ -234,7 +215,7 @@ mod tests {
             Complex::new(7.0, 8.0),
         ];
 
-        let (reals, imags) = separate_re_im(&complex_vec, 2);
+        let (reals, imags) = separate_re_im(&complex_vec);
 
         let recombined_vec = combine_re_im(&reals, &imags);
 

From 13fe64c1058ac3d4997b9910c6382936837ba8d9 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 8 May 2024 12:57:20 -0400
Subject: [PATCH 07/23] Update docs for interleaved fft

---
 src/lib.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 8f2dbfb..df32329 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -67,10 +67,11 @@ impl_fft_for!(fft_32, f32, Planner32, fft_32_with_opts_and_plan);
 
 macro_rules! impl_fft_interleaved_for {
     ($func_name:ident, $precision:ty, $fft_func:ident) => {
-        /// FFT -- Decimation in Frequency. This is just the decimation-in-time algorithm, reversed.
-        /// This call to FFT is run, in-place.
-        /// The input should be provided in normal order, and then the modified input is bit-reversed.
+        /// FFT Interleaved -- this is an alternative to `fft_64`/`fft_32` in the case where
+        /// the input data is a array of `[num_complex::Complex]`.
         ///
+        /// The input should be provided in normal order, and then the modified input is
+        /// bit-reversed.
         ///
         /// ## References
         /// <https://inst.eecs.berkeley.edu/~ee123/sp15/Notes/Lecture08_FFT_and_SpectAnalysis.key.pdf>

From a0d2822e3f4f0d8e4cce4fb55f0f9aef7fb63192 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 8 May 2024 13:25:14 -0400
Subject: [PATCH 08/23] Transition `num-complex` dependency to optional

- Make num-complex optional and non-default, and all functions that take
  and test interleaved FFT are now under the same feature

- Bump num-complex version 0.4.5 --> 0.4.6

- Enable num-complex for docs/docs.rs

- Update github action workflow to run tests for all features

- Fix formatting in docs to fix links
---
 .github/workflows/rust.yml |  1 +
 Cargo.toml                 |  8 +++++++-
 src/lib.rs                 | 15 ++++++++++++---
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 157d5aa..626f294 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -63,6 +63,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: test
+          args: --all-features
 
   coverage:
     runs-on: ubuntu-latest
diff --git a/Cargo.toml b/Cargo.toml
index 4b5c7a7..ac74756 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,11 @@ exclude = ["assets", "scripts", "benches"]
 [dependencies]
 num-traits = "0.2.18"
 multiversion = "0.7"
-num-complex = "0.4.5"
+num-complex = { version = "0.4.6", optional = true }
+
+[features]
+default = []
+complex-nums = ["dep:num-complex"]
 
 [dev-dependencies]
 utilities = { path = "utilities" }
@@ -28,3 +32,5 @@ panic = "abort"
 inherits = "release"
 debug = true
 
+[package.metadata.docs.rs]
+all-features = true
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index df32329..c16b641 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,7 +8,9 @@
 #![forbid(unsafe_code)]
 #![feature(portable_simd, avx512_target_feature)]
 
+#[cfg(feature = "complex-nums")]
 use num_complex::Complex;
+#[cfg(feature = "complex-nums")]
 use num_traits::Float;
 
 use crate::cobra::cobra_apply;
@@ -65,10 +67,11 @@ macro_rules! impl_fft_for {
 impl_fft_for!(fft_64, f64, Planner64, fft_64_with_opts_and_plan);
 impl_fft_for!(fft_32, f32, Planner32, fft_32_with_opts_and_plan);
 
+#[cfg(feature = "complex-nums")]
 macro_rules! impl_fft_interleaved_for {
     ($func_name:ident, $precision:ty, $fft_func:ident) => {
-        /// FFT Interleaved -- this is an alternative to `fft_64`/`fft_32` in the case where
-        /// the input data is a array of `[num_complex::Complex]`.
+        /// FFT Interleaved -- this is an alternative to [`fft_64`]/[`fft_32`] in the case where
+        /// the input data is a array of [`num_complex::Complex`].
         ///
         /// The input should be provided in normal order, and then the modified input is
         /// bit-reversed.
@@ -83,7 +86,9 @@ macro_rules! impl_fft_interleaved_for {
     };
 }
 
+#[cfg(feature = "complex-nums")]
 impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32);
+#[cfg(feature = "complex-nums")]
 impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64);
 
 macro_rules! impl_fft_with_opts_and_plan_for {
@@ -177,6 +182,7 @@ impl_fft_with_opts_and_plan_for!(
 
 /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
 /// into separate vectors for the corresponding real and imaginary components.
+#[cfg(feature = "complex-nums")]
 pub fn separate_re_im<T: Float>(signal: &[Complex<T>]) -> (Vec<T>, Vec<T>) {
     signal.iter().map(|z| (z.re, z.im)).unzip()
 }
@@ -187,6 +193,7 @@ pub fn separate_re_im<T: Float>(signal: &[Complex<T>]) -> (Vec<T>, Vec<T>) {
 /// # Panics
 ///
 /// Panics if `reals.len() != imags.len()`.
+#[cfg(feature = "complex-nums")]
 pub fn combine_re_im<T: Float>(reals: &[T], imags: &[T]) -> Vec<Complex<T>> {
     assert_eq!(reals.len(), imags.len());
 
@@ -202,11 +209,12 @@ mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::num_complex::Complex;
     use utilities::rustfft::FftPlanner;
+    use utilities::rustfft::num_complex::Complex;
 
     use super::*;
 
+    #[cfg(feature = "complex-nums")]
     #[test]
     fn test_separate_and_combine_re_im() {
         let complex_vec = vec![
@@ -319,6 +327,7 @@ mod tests {
     test_fft_correctness!(fft_correctness_32, f32, fft_32, 4, 9);
     test_fft_correctness!(fft_correctness_64, f64, fft_64, 4, 17);
 
+    #[cfg(feature = "complex-nums")]
     #[test]
     fn fft_interleaved_correctness() {
         let n = 4;

From 05713ffbb1d92b0aea5ff36f75e922925295bb21 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 8 May 2024 13:29:34 -0400
Subject: [PATCH 09/23] Fix formatting

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index c16b641..fec23e1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -209,8 +209,8 @@ mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::FftPlanner;
     use utilities::rustfft::num_complex::Complex;
+    use utilities::rustfft::FftPlanner;
 
     use super::*;
 

From 1377ead84098df502fcb4ce801e631fdb2f4c429 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 8 May 2024 13:41:03 -0400
Subject: [PATCH 10/23] Fix formatting

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index 56ecf6c..383dbd7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -230,8 +230,8 @@ mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::FftPlanner;
     use utilities::rustfft::num_complex::Complex;
+    use utilities::rustfft::FftPlanner;
 
     use super::*;
 

From 9699f9ee967b1c318f173eb1e39be593c5cefa09 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 22 May 2024 12:32:52 -0400
Subject: [PATCH 11/23] Vectorize deinterleaving of AoS --> SoA

Use bytemuck + SIMD::deinterleave to rearrange input data from a slice
of Complex values into 2 slices of f32 or f64 values
---
 Cargo.toml |   5 +--
 src/lib.rs | 103 ++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 2bc3158..e3ab005 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,11 +13,12 @@ exclude = ["assets", "scripts", "benches"]
 [dependencies]
 num-traits = "0.2.18"
 multiversion = "0.7"
-num-complex = { version = "0.4.6", optional = true }
+num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true }
+bytemuck = { version = "1.16.0", optional = true }
 
 [features]
 default = []
-complex-nums = ["dep:num-complex"]
+complex-nums = ["dep:num-complex", "dep:bytemuck"]
 
 [dev-dependencies]
 utilities = { path = "utilities" }
diff --git a/src/lib.rs b/src/lib.rs
index 383dbd7..da93cdd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,13 +1,20 @@
 #![doc = include_str!("../README.md")]
-#![warn(clippy::complexity)]
-#![warn(missing_docs)]
-#![warn(clippy::style)]
-#![warn(clippy::correctness)]
-#![warn(clippy::suspicious)]
-#![warn(clippy::perf)]
+#![warn(
+    missing_docs,
+    clippy::complexity,
+    clippy::perf,
+    clippy::style,
+    clippy::correctness,
+    clippy::suspicious
+)]
 #![forbid(unsafe_code)]
 #![feature(portable_simd, avx512_target_feature)]
 
+#[cfg(feature = "complex-nums")]
+use std::simd::{f32x16, f64x8};
+
+#[cfg(feature = "complex-nums")]
+use bytemuck::cast_slice;
 #[cfg(feature = "complex-nums")]
 use num_complex::Complex;
 #[cfg(feature = "complex-nums")]
@@ -90,9 +97,9 @@ impl_fft_for!(fft_32, f32, Planner32, fft_32_with_opts_and_plan);
 
 #[cfg(feature = "complex-nums")]
 macro_rules! impl_fft_interleaved_for {
-    ($func_name:ident, $precision:ty, $fft_func:ident) => {
+    ($func_name:ident, $precision:ty, $fft_func:ident, $deinterleaving_func: ident) => {
         /// FFT Interleaved -- this is an alternative to [`fft_64`]/[`fft_32`] in the case where
-        /// the input data is a array of [`num_complex::Complex`].
+        /// the input data is a array of [`Complex`].
         ///
         /// The input should be provided in normal order, and then the modified input is
         /// bit-reversed.
@@ -100,7 +107,7 @@ macro_rules! impl_fft_interleaved_for {
         /// ## References
         /// <https://inst.eecs.berkeley.edu/~ee123/sp15/Notes/Lecture08_FFT_and_SpectAnalysis.key.pdf>
         pub fn $func_name(signal: &mut [Complex<$precision>], direction: Direction) {
-            let (mut reals, mut imags) = separate_re_im(signal);
+            let (mut reals, mut imags) = $deinterleaving_func(signal);
             $fft_func(&mut reals, &mut imags, direction);
             signal.copy_from_slice(&combine_re_im(&reals, &imags))
         }
@@ -108,9 +115,9 @@ macro_rules! impl_fft_interleaved_for {
 }
 
 #[cfg(feature = "complex-nums")]
-impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32);
+impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32, separate_re_im_f32);
 #[cfg(feature = "complex-nums")]
-impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64);
+impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64, separate_re_im_f64);
 
 macro_rules! impl_fft_with_opts_and_plan_for {
     ($func_name:ident, $precision:ty, $planner:ty, $simd_butterfly_kernel:ident, $lanes:literal) => {
@@ -201,13 +208,69 @@ impl_fft_with_opts_and_plan_for!(
     16
 );
 
-/// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
-/// into separate vectors for the corresponding real and imaginary components.
-#[cfg(feature = "complex-nums")]
-pub fn separate_re_im<T: Float>(signal: &[Complex<T>]) -> (Vec<T>, Vec<T>) {
-    signal.iter().map(|z| (z.re, z.im)).unzip()
+macro_rules! impl_separate_re_im {
+    ($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => {
+        /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
+        /// into separate vectors for the corresponding real and imaginary components.
+        #[multiversion::multiversion(
+                                    targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
+                                            "x86_64+avx2+fma", // x86_64-v3
+                                            "x86_64+sse4.2", // x86_64-v2
+                                            "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
+                                            "x86+avx2+fma",
+                                            "x86+sse4.2",
+                                            "x86+sse2",
+                                        ))]
+        pub fn $func_name(
+            signal: &[Complex<$precision>],
+        ) -> (Vec<$precision>, Vec<$precision>) {
+            let n = signal.len();
+            let mut reals = vec![0.0; n];
+            let mut imags = vec![0.0; n];
+
+            let complex_f64 = cast_slice(signal);
+            const CHUNK_SIZE: usize = $lanes * 2;
+
+            let mut i = 0;
+            for ((chunk, chunk_re), chunk_im) in complex_f64
+                .chunks_exact(CHUNK_SIZE)
+                .zip(reals.chunks_exact_mut($lanes))
+                .zip(imags.chunks_exact_mut($lanes))
+            {
+                let (first_half, second_half) = chunk.split_at($lanes);
+
+                let a = <$simd_vec>::from_slice(&first_half);
+                let b = <$simd_vec>::from_slice(&second_half);
+                let (re_deinterleaved, im_deinterleaved) = a.deinterleave(b);
+
+                chunk_re.copy_from_slice(&re_deinterleaved.to_array());
+                chunk_im.copy_from_slice(&im_deinterleaved.to_array());
+                i += CHUNK_SIZE;
+            }
+
+            let remainder = complex_f64.chunks_exact(CHUNK_SIZE).remainder();
+            if !remainder.is_empty() {
+            remainder
+                .chunks_exact(2)
+                .zip(reals[i..].iter_mut())
+                .zip(imags[i..].iter_mut())
+                .for_each(|((c, re), im)| {
+                    *re = c[0];
+                    *im = c[1];
+                });
+            }
+
+            (reals, imags)
+        }
+    };
 }
 
+#[cfg(feature = "complex-nums")]
+impl_separate_re_im!(separate_re_im_f32, f32, 16, f32x16);
+
+#[cfg(feature = "complex-nums")]
+impl_separate_re_im!(separate_re_im_f64, f64, 8, f64x8);
+
 /// Utility function to combine separate vectors of real and imaginary components
 /// into a single vector of Complex Number Structs.
 ///
@@ -238,14 +301,14 @@ mod tests {
     #[cfg(feature = "complex-nums")]
     #[test]
     fn test_separate_and_combine_re_im() {
-        let complex_vec = vec![
+        let complex_vec: Vec<_> = vec![
             Complex::new(1.0, 2.0),
             Complex::new(3.0, 4.0),
             Complex::new(5.0, 6.0),
             Complex::new(7.0, 8.0),
         ];
 
-        let (reals, imags) = separate_re_im(&complex_vec);
+        let (reals, imags) = separate_re_im_f64(&complex_vec);
 
         let recombined_vec = combine_re_im(&reals, &imags);
 
@@ -351,7 +414,7 @@ mod tests {
     #[cfg(feature = "complex-nums")]
     #[test]
     fn fft_interleaved_correctness() {
-        let n = 4;
+        let n = 10;
         let big_n = 1 << n;
         let mut actual_signal: Vec<_> = (1..=big_n).map(|i| Complex::new(i as f64, 0.0)).collect();
         let mut expected_reals: Vec<_> = (1..=big_n).map(|i| i as f64).collect();
@@ -369,7 +432,7 @@ mod tests {
                 assert_float_closeness(z.im, z_im, 1e-10);
             });
 
-        let n = 4;
+        let n = 10;
         let big_n = 1 << n;
         let mut actual_signal: Vec<_> = (1..=big_n).map(|i| Complex::new(i as f32, 0.0)).collect();
         let mut expected_reals: Vec<_> = (1..=big_n).map(|i| i as f32).collect();

From 4d431903ba0b78c7ce19a2fc36a587f09da8a59d Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 22 May 2024 12:37:50 -0400
Subject: [PATCH 12/23] Put macro definition begind feature flag

---
 src/lib.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index da93cdd..6bdd81e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -208,6 +208,7 @@ impl_fft_with_opts_and_plan_for!(
     16
 );
 
+#[cfg(feature = "complex-nums")]
 macro_rules! impl_separate_re_im {
     ($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => {
         /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
@@ -293,8 +294,8 @@ mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::num_complex::Complex;
     use utilities::rustfft::FftPlanner;
+    use utilities::rustfft::num_complex::Complex;
 
     use super::*;
 

From 356e468cfd79bdd7420e2fcdb97c4b32f16245f9 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 22 May 2024 12:40:01 -0400
Subject: [PATCH 13/23] Fix formatting

---
 src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index 6bdd81e..f1b804a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -294,8 +294,8 @@ mod tests {
     use std::ops::Range;
 
     use utilities::assert_float_closeness;
-    use utilities::rustfft::FftPlanner;
     use utilities::rustfft::num_complex::Complex;
+    use utilities::rustfft::FftPlanner;
 
     use super::*;
 

From fbb1bad41f66617123399cc8fe49ce44e6398948 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 22 May 2024 13:09:49 -0400
Subject: [PATCH 14/23] Remove index tracker from hot path

---
 src/lib.rs | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index f1b804a..77ba60c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -229,11 +229,10 @@ macro_rules! impl_separate_re_im {
             let mut reals = vec![0.0; n];
             let mut imags = vec![0.0; n];
 
-            let complex_f64 = cast_slice(signal);
+            let complex_t: &[$precision] = cast_slice(signal);
             const CHUNK_SIZE: usize = $lanes * 2;
 
-            let mut i = 0;
-            for ((chunk, chunk_re), chunk_im) in complex_f64
+            for ((chunk, chunk_re), chunk_im) in complex_t
                 .chunks_exact(CHUNK_SIZE)
                 .zip(reals.chunks_exact_mut($lanes))
                 .zip(imags.chunks_exact_mut($lanes))
@@ -246,11 +245,11 @@ macro_rules! impl_separate_re_im {
 
                 chunk_re.copy_from_slice(&re_deinterleaved.to_array());
                 chunk_im.copy_from_slice(&im_deinterleaved.to_array());
-                i += CHUNK_SIZE;
             }
 
-            let remainder = complex_f64.chunks_exact(CHUNK_SIZE).remainder();
+            let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder();
             if !remainder.is_empty() {
+                let i = reals.len() - remainder.len() / 2;
             remainder
                 .chunks_exact(2)
                 .zip(reals[i..].iter_mut())

From 39578345755d689262c98225dd8bd4391486a347 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 22 May 2024 13:32:55 -0400
Subject: [PATCH 15/23] Add benchmark using criterion

---
 Cargo.toml       |  5 +++
 benches/bench.rs | 82 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 benches/bench.rs

diff --git a/Cargo.toml b/Cargo.toml
index e3ab005..41a5d40 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,11 @@ complex-nums = ["dep:num-complex", "dep:bytemuck"]
 [dev-dependencies]
 utilities = { path = "utilities" }
 fftw = "0.8.0"
+criterion = "0.5.1"
+
+[[bench]]
+name = "bench"
+harness = false
 
 [profile.release]
 codegen-units = 1
diff --git a/benches/bench.rs b/benches/bench.rs
new file mode 100644
index 0000000..c0d2a6e
--- /dev/null
+++ b/benches/bench.rs
@@ -0,0 +1,82 @@
+#![feature(portable_simd, avx512_target_feature)]
+
+use std::simd::{f32x16, f64x8};
+
+use bytemuck::cast_slice;
+use criterion::{black_box, Criterion, criterion_group, criterion_main};
+use num_complex::Complex;
+
+use phastft::separate_re_im_f64;
+
+macro_rules! impl_separate_re_im_with_capacity {
+    ($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => {
+        /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
+        /// into separate vectors for the corresponding real and imaginary components.
+        #[multiversion::multiversion(
+                                    targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
+                                            "x86_64+avx2+fma", // x86_64-v3
+                                            "x86_64+sse4.2", // x86_64-v2
+                                            "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
+                                            "x86+avx2+fma",
+                                            "x86+sse4.2",
+                                            "x86+sse2",
+                                        ))]
+        pub fn $func_name(
+            signal: &[Complex<$precision>],
+        ) -> (Vec<$precision>, Vec<$precision>) {
+            let n = signal.len();
+            let mut reals = Vec::with_capacity(n);
+            let mut imags = Vec::with_capacity(n);
+
+            let complex_t: &[$precision] = cast_slice(signal);
+            const CHUNK_SIZE: usize = $lanes * 2;
+
+            for chunk in complex_t
+                .chunks_exact(CHUNK_SIZE)
+            {
+                let (first_half, second_half) = chunk.split_at($lanes);
+
+                let a = <$simd_vec>::from_slice(&first_half);
+                let b = <$simd_vec>::from_slice(&second_half);
+                let (re_deinterleaved, im_deinterleaved) = a.deinterleave(b);
+
+                reals.extend_from_slice(&re_deinterleaved.to_array());
+                imags.extend_from_slice(&im_deinterleaved.to_array());
+            }
+
+    let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder();
+    if !remainder.is_empty() {
+        let i = n - remainder.len() / 2;
+        for (j, chunk) in remainder.chunks_exact(2).enumerate() {
+            reals.push(chunk[0]);
+            imags.push(chunk[1]);
+        }
+    }
+
+            (reals, imags)
+        }
+    };
+}
+
+#[cfg(feature = "complex-nums")]
+impl_separate_re_im_with_capacity!(separate_re_im_f32_with_cap, f32, 16, f32x16);
+
+#[cfg(feature = "complex-nums")]
+impl_separate_re_im_with_capacity!(separate_re_im_f64_with_cap, f64, 8, f64x8);
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let signal: Vec<Complex<f64>> = (0..(1 << 20))
+        .map(|x| Complex::new(x as f64, (x * 2) as f64))
+        .collect();
+
+    c.bench_function("separate_re_im_with_zeroed_vecs", |b| {
+        b.iter(|| separate_re_im_f64(black_box(&signal)));
+    });
+
+    c.bench_function("separate_re_im_with_capacity", |b| {
+        b.iter(|| separate_re_im_f64_with_cap(black_box(&signal)));
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);

From b429a32f2f6c5a51837c2c02b96542d665c4fb6a Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 22 May 2024 13:33:34 -0400
Subject: [PATCH 16/23] Fix formatting

---
 benches/bench.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benches/bench.rs b/benches/bench.rs
index c0d2a6e..4812cb5 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -3,7 +3,7 @@
 use std::simd::{f32x16, f64x8};
 
 use bytemuck::cast_slice;
-use criterion::{black_box, Criterion, criterion_group, criterion_main};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use num_complex::Complex;
 
 use phastft::separate_re_im_f64;

From b8affb2cf0ec401d0ba0a90aabd558a253b433a1 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 22 May 2024 16:19:11 -0400
Subject: [PATCH 17/23] Account for different signal sizes

---
 benches/bench.rs | 45 +++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/benches/bench.rs b/benches/bench.rs
index 4812cb5..7e3b12f 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -3,7 +3,7 @@
 use std::simd::{f32x16, f64x8};
 
 use bytemuck::cast_slice;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 use num_complex::Complex;
 
 use phastft::separate_re_im_f64;
@@ -44,14 +44,13 @@ macro_rules! impl_separate_re_im_with_capacity {
                 imags.extend_from_slice(&im_deinterleaved.to_array());
             }
 
-    let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder();
-    if !remainder.is_empty() {
-        let i = n - remainder.len() / 2;
-        for (j, chunk) in remainder.chunks_exact(2).enumerate() {
-            reals.push(chunk[0]);
-            imags.push(chunk[1]);
-        }
-    }
+            let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder();
+            if !remainder.is_empty() {
+                for chunk in remainder.chunks_exact(2) {
+                    reals.push(chunk[0]);
+                    imags.push(chunk[1]);
+                }
+            }
 
             (reals, imags)
         }
@@ -65,17 +64,27 @@ impl_separate_re_im_with_capacity!(separate_re_im_f32_with_cap, f32, 16, f32x16)
 impl_separate_re_im_with_capacity!(separate_re_im_f64_with_cap, f64, 8, f64x8);
 
 fn criterion_benchmark(c: &mut Criterion) {
-    let signal: Vec<Complex<f64>> = (0..(1 << 20))
-        .map(|x| Complex::new(x as f64, (x * 2) as f64))
-        .collect();
+    let sizes = vec![1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18, 1 << 20];
 
-    c.bench_function("separate_re_im_with_zeroed_vecs", |b| {
-        b.iter(|| separate_re_im_f64(black_box(&signal)));
-    });
+    let mut group = c.benchmark_group("separate_re_im");
+    for size in sizes.into_iter() {
+        let signal: Vec<_> = (0..size)
+            .map(|x| Complex::new(x as f64, (x * 2) as f64))
+            .collect();
 
-    c.bench_function("separate_re_im_with_capacity", |b| {
-        b.iter(|| separate_re_im_f64_with_cap(black_box(&signal)));
-    });
+        group.throughput(Throughput::Elements(size));
+
+        group.bench_with_input(
+            BenchmarkId::new("with_zeroed_vecs", size),
+            &signal,
+            |b, s| b.iter(|| separate_re_im_f64(black_box(s))),
+        );
+
+        group.bench_with_input(BenchmarkId::new("with_capacity", size), &signal, |b, s| {
+            b.iter(|| separate_re_im_f64_with_cap(black_box(s)));
+        });
+    }
+    group.finish();
 }
 
 criterion_group!(benches, criterion_benchmark);

From 98503e94c7cd0bacfe2c9180df1a486e1bb9922a Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Sun, 14 Jul 2024 14:15:01 -0400
Subject: [PATCH 18/23] Add new python benchmarking "framework"

---
 benches/bench.rs       |  38 ++++---
 benches/run_benches.py | 228 +++++++++++++++++++++++++++++++++++++++++
 src/kernels.rs         |  16 +--
 src/lib.rs             |   2 +-
 4 files changed, 263 insertions(+), 21 deletions(-)
 create mode 100644 benches/run_benches.py

diff --git a/benches/bench.rs b/benches/bench.rs
index 9bf511e..8120787 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -13,8 +13,7 @@ use utilities::rustfft::num_complex::Complex;
 use utilities::rustfft::FftPlanner;
 
 const LENGTHS: &[usize] = &[
-    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
-];
+    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
 
 fn generate_numbers<T: Float>(n: usize) -> (Vec<T>, Vec<T>)
 where
@@ -112,22 +111,37 @@ fn benchmark_inverse_f32(c: &mut Criterion) {
 }
 
 fn benchmark_forward_f64(c: &mut Criterion) {
-    let options = Options::default();
+    let mut group = c.benchmark_group("Forward f64");
 
     for n in LENGTHS.iter() {
         let len = 1 << n;
-        let id = format!("FFT Forward f64 {} elements", len);
+        let id = "PhastFT FFT Forward";
+        let options = Options::guess_options(len);
         let planner = Planner64::new(len, Direction::Forward);
+        let (mut reals, mut imags) = generate_numbers(len);
+        group.throughput(Throughput::Elements(len as u64));
 
-        c.bench_function(&id, |b| {
-            let (mut reals, mut imags) = generate_numbers(len);
+        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| {
             b.iter(|| {
-                black_box(fft_64_with_opts_and_plan(
-                    &mut reals, &mut imags, &options, &planner,
-                ));
+                fft_64_with_opts_and_plan(
+                    black_box(&mut reals),
+                    black_box(&mut imags),
+                    black_box(&options),
+                    black_box(&planner),
+                );
             });
         });
+
+        let id = "RustFFT FFT Forward";
+        let mut planner = FftPlanner::<f64>::new();
+        let fft = planner.plan_fft_forward(len);
+        let mut signal = generate_complex_numbers(len);
+
+        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| {
+            b.iter(|| fft.process(black_box(&mut signal)));
+        });
     }
+    group.finish();
 }
 
 fn benchmark_inverse_f64(c: &mut Criterion) {
@@ -151,9 +165,9 @@ fn benchmark_inverse_f64(c: &mut Criterion) {
 
 criterion_group!(
     benches,
-    benchmark_forward_f32,
-    benchmark_inverse_f32,
+    // benchmark_forward_f32,
+    // benchmark_inverse_f32,
     benchmark_forward_f64,
-    benchmark_inverse_f64
+    // benchmark_inverse_f64
 );
 criterion_main!(benches);
diff --git a/benches/run_benches.py b/benches/run_benches.py
new file mode 100644
index 0000000..7c72b19
--- /dev/null
+++ b/benches/run_benches.py
@@ -0,0 +1,228 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+import matplotlib.pyplot as plt
+import shutil
+from datetime import datetime
+import logging
+import numpy as np
+
+# Configuration
+OUTPUT_DIR = "benchmark_output"
+HISTORY_DIR = "benchmark_history"
+LOG_DIR = "benchmark_logs"
+MAX_ITERS = 1 << 10
+START = 6
+END = 20
+STD_THRESHOLD = 0.05  # 5% standard deviation threshold
+
+# Ensure log directory exists
+Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
+
+# Setup logging
+logging.basicConfig(
+    filename=Path(LOG_DIR) / "benchmark.log",
+    level=logging.INFO,
+    format="%(asctime)s - %(message)s",
+)
+console = logging.StreamHandler()
+console.setLevel(logging.INFO)
+formatter = logging.Formatter("%(asctime)s - %(message)s")
+console.setFormatter(formatter)
+logging.getLogger().addHandler(console)
+
+
+def run_command(command, cwd=None):
+    result = subprocess.run(
+        command, shell=True, text=True, capture_output=True, cwd=cwd
+    )
+    if result.returncode != 0:
+        logging.error(f"Error running command: {command}\n{result.stderr}")
+        sys.exit(result.returncode)
+    return result.stdout.strip()
+
+
+def clean_build_rust():
+    logging.info("Cleaning and building Rust project...")
+    run_command("cargo clean")
+    run_command("cargo build --release --examples")
+
+
+def benchmark_with_stabilization(executable_name, n, max_iters, std_threshold):
+    times = []
+    for i in range(max_iters):
+        result = run_command(f"../target/release/examples/{executable_name} {n}")
+        times.append(int(result))
+        if len(times) > 10:  # Start evaluating after a minimum number of runs
+            current_std = np.std(times) / np.mean(times)
+            if current_std < std_threshold:
+                break
+    return times
+
+
+def benchmark(
+    benchmark_name, output_subdir, start, end, max_iters, std_threshold, executable_name
+):
+    output_dir_path = Path(OUTPUT_DIR) / output_subdir
+    output_dir_path.mkdir(parents=True, exist_ok=True)
+
+    for n in range(start, end + 1):
+        logging.info(
+            f"Running {benchmark_name} benchmark for N = 2^{n} with a standard deviation threshold of {std_threshold * 100}%..."
+        )
+        times = benchmark_with_stabilization(
+            executable_name, n, max_iters, std_threshold
+        )
+        output_file = output_dir_path / f"size_{n}"
+        with open(output_file, "w") as f:
+            for time in times:
+                f.write(f"{time}\n")
+        logging.info(
+            f"Completed N = 2^{n} in {len(times)} iterations with a final standard deviation of {np.std(times) / np.mean(times):.2%}"
+        )
+
+
+def read_benchmark_results(output_dir, start, end):
+    sizes = []
+    times = []
+
+    for n in range(start, end + 1):
+        size_file = Path(output_dir) / f"size_{n}"
+        if size_file.exists():
+            with open(size_file, "r") as f:
+                data = f.readlines()
+                data = [int(line.strip()) for line in data]
+                if data:
+                    min_time_ns = min(data)
+                    sizes.append(2**n)
+                    times.append(min_time_ns)
+                else:
+                    logging.warning(f"No data found in file: {size_file}")
+        else:
+            logging.warning(f"File does not exist: {size_file}")
+
+    return sizes, times
+
+
+def plot_benchmark_results(output_subdirs, start, end, history_dirs=[]):
+    plt.figure(figsize=(10, 6))
+    has_data = False
+
+    # Plot current results
+    for subdir in output_subdirs:
+        sizes, times = read_benchmark_results(Path(OUTPUT_DIR) / subdir, start, end)
+        if sizes and times:
+            has_data = True
+            plt.plot(sizes, times, marker="o", label=f"current {subdir}")
+
+    # Plot previous results from history for PhastFT
+    for history_dir in history_dirs:
+        sizes, times = read_benchmark_results(
+            Path(history_dir) / "benchmark_output" / "phastft", start, end
+        )
+        if sizes and times:
+            has_data = True
+            timestamp = Path(history_dir).stem
+            plt.plot(
+                sizes, times, marker="x", linestyle="--", label=f"{timestamp} phastft"
+            )
+
+    if has_data:
+        plt.title("Benchmark Results")
+        plt.xlabel("FFT Size (N)")
+        plt.ylabel("Minimum Time (ns)")
+        plt.xscale("log")
+        plt.yscale("log")
+        plt.grid(True, which="both", ls="--")
+        plt.legend()
+        plt.savefig(f"{OUTPUT_DIR}/benchmark_results.png", dpi=600)
+        # plt.show()
+    else:
+        logging.warning("No data available to plot.")
+
+
+def compare_results(current_dir, previous_dir, start, end):
+    changes = {}
+    for n in range(start, end + 1):
+        current_file = Path(current_dir) / f"size_{n}"
+        previous_file = (
+            Path(previous_dir) / "benchmark_output" / "phastft" / f"size_{n}"
+        )
+
+        if current_file.exists() and previous_file.exists():
+            with open(current_file, "r") as cf, open(previous_file, "r") as pf:
+                current_data = [int(line.strip()) for line in cf.readlines()]
+                previous_data = [int(line.strip()) for line in pf.readlines()]
+
+                if current_data and previous_data:
+                    current_min = min(current_data)
+                    previous_min = min(previous_data)
+
+                    if current_min != previous_min:
+                        change = ((current_min - previous_min) / previous_min) * 100
+                        changes[n] = change
+                else:
+                    logging.warning(
+                        f"Data missing in files for size 2^{n}: Current data length: {len(current_data)}, Previous data length: {len(previous_data)}"
+                    )
+        else:
+            logging.warning(
+                f"Missing files for size 2^{n}: Current file exists: {current_file.exists()}, Previous file exists: {previous_file.exists()}"
+            )
+
+    return changes
+
+
+def archive_current_results():
+    if Path(OUTPUT_DIR).exists():
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        history_dir = Path(HISTORY_DIR) / timestamp
+        history_dir.mkdir(parents=True, exist_ok=True)
+        shutil.move(OUTPUT_DIR, history_dir)
+        logging.info(f"Archived current results to: {history_dir}")
+    else:
+        logging.warning(
+            f"Output directory '{OUTPUT_DIR}' does not exist and cannot be archived."
+        )
+
+
+def main():
+    clean_build_rust()
+
+    # Check if there are previous results for comparison
+    history_dirs = (
+        sorted(Path(HISTORY_DIR).iterdir(), key=os.path.getmtime)
+        if Path(HISTORY_DIR).exists()
+        else []
+    )
+    latest_previous_dir = history_dirs[-1] if history_dirs else None
+
+    # Run new benchmarks for PhastFT, RustFFT, and FFTW3
+    benchmark("PhastFT", "phastft", START, END, MAX_ITERS, STD_THRESHOLD, "benchmark")
+    benchmark("RustFFT", "rustfft", START, END, MAX_ITERS, STD_THRESHOLD, "rustfft")
+    benchmark(
+        "FFTW3 Rust bindings", "fftwrb", START, END, MAX_ITERS, STD_THRESHOLD, "fftwrb"
+    )
+
+    # Compare new PhastFT benchmarks against previous results
+    if latest_previous_dir:
+        logging.info(f"Comparing with previous results from: {latest_previous_dir}")
+        changes = compare_results(
+            Path(OUTPUT_DIR) / "phastft", latest_previous_dir, START, END
+        )
+        for n, change in changes.items():
+            status = "improvement" if change < 0 else "regression"
+            logging.info(f"N = 2^{n}: {abs(change):.2f}% {status}")
+    else:
+        logging.info("No previous results found for comparison.")
+
+    # Plot benchmark results
+    plot_benchmark_results(["phastft", "rustfft", "fftwrb"], START, END, history_dirs)
+
+    # Archive current results
+    archive_current_results()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/kernels.rs b/src/kernels.rs
index 8651ee9..6d5e3aa 100644
--- a/src/kernels.rs
+++ b/src/kernels.rs
@@ -119,15 +119,15 @@ pub(crate) fn fft_chunk_n<T: Float>(
 ))]
 #[inline]
 pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
-    let dist = 2;
-    let chunk_size = dist << 1;
+    const DIST: usize = 2;
+    const CHUNK_SIZE: usize = DIST << 1;
 
     reals
-        .chunks_exact_mut(chunk_size)
-        .zip(imags.chunks_exact_mut(chunk_size))
+        .array_chunks_mut::<CHUNK_SIZE>()
+        .zip(imags.array_chunks_mut::<CHUNK_SIZE>())
         .for_each(|(reals_chunk, imags_chunk)| {
-            let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist);
-            let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist);
+            let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
+            let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             let real_c0 = reals_s0[0];
             let real_c1 = reals_s1[0];
@@ -163,8 +163,8 @@ pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
 #[inline]
 pub(crate) fn fft_chunk_2<T: Float>(reals: &mut [T], imags: &mut [T]) {
     reals
-        .chunks_exact_mut(2)
-        .zip(imags.chunks_exact_mut(2))
+        .array_chunks_mut::<2>()
+        .zip(imags.array_chunks_mut::<2>())
         .for_each(|(reals_chunk, imags_chunk)| {
             let z0_re = reals_chunk[0];
             let z0_im = imags_chunk[0];
diff --git a/src/lib.rs b/src/lib.rs
index c0e20f7..e1b6def 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,7 +8,7 @@
     clippy::suspicious
 )]
 #![forbid(unsafe_code)]
-#![feature(portable_simd, avx512_target_feature)]
+#![feature(array_chunks, portable_simd, avx512_target_feature)]
 
 #[cfg(feature = "complex-nums")]
 use std::simd::{f32x16, f64x8};

From a201e2c1151d24885eeffbc5b2dc550cbd9f6193 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Sun, 14 Jul 2024 14:17:26 -0400
Subject: [PATCH 19/23] Update todo

---
 src/twiddles.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/twiddles.rs b/src/twiddles.rs
index 7e11ba9..6008ad8 100644
--- a/src/twiddles.rs
+++ b/src/twiddles.rs
@@ -217,7 +217,7 @@ mod tests {
 
     use super::*;
 
-    // TODO(saveliy): use
+    // TODO(saveliy): try to use only real twiddle factors since sin is just a phase shift of cos
     #[test]
     fn twiddles_cos_only() {
         let n = 4;
@@ -234,7 +234,7 @@ mod tests {
         assert!(fwd_twiddles_re.len() == dist && fwd_twiddles_im.len() == dist);
 
         for i in 0..dist {
-            let w_re = fwd_twiddles_re[i];
+            let _w_re = fwd_twiddles_re[i];
             let expected_w_im = fwd_twiddles_im[i];
 
             let actual_w_im = -fwd_twiddles_re[(i + dist / 2) % dist];

From 403684b60b97c9163b0c93251da930944a0874a9 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Sun, 14 Jul 2024 14:41:15 -0400
Subject: [PATCH 20/23] Make examples output time elapsed in nanoseconds

---
 benches/bench.rs      | 3 ++-
 examples/benchmark.rs | 2 +-
 examples/fftwrb.rs    | 2 +-
 examples/rustfft.rs   | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/benches/bench.rs b/benches/bench.rs
index 8120787..305399d 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -13,7 +13,8 @@ use utilities::rustfft::num_complex::Complex;
 use utilities::rustfft::FftPlanner;
 
 const LENGTHS: &[usize] = &[
-    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24];
+    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+];
 
 fn generate_numbers<T: Float>(n: usize) -> (Vec<T>, Vec<T>)
 where
diff --git a/examples/benchmark.rs b/examples/benchmark.rs
index e6ce897..0fe62bf 100644
--- a/examples/benchmark.rs
+++ b/examples/benchmark.rs
@@ -18,7 +18,7 @@ fn benchmark_fft_64(n: usize) {
 
     let now = std::time::Instant::now();
     fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner);
-    let elapsed = now.elapsed().as_micros();
+    let elapsed = now.elapsed().as_nanos();
     println!("{elapsed}");
 }
 
diff --git a/examples/fftwrb.rs b/examples/fftwrb.rs
index db4beee..3b1bf5a 100644
--- a/examples/fftwrb.rs
+++ b/examples/fftwrb.rs
@@ -34,7 +34,7 @@ fn benchmark_fftw(n: usize) {
         &mut nums,
     )
     .unwrap();
-    let elapsed = now.elapsed().as_micros();
+    let elapsed = now.elapsed().as_nanos();
     println!("{elapsed}");
 }
 
diff --git a/examples/rustfft.rs b/examples/rustfft.rs
index 8e9ac51..6f40f80 100644
--- a/examples/rustfft.rs
+++ b/examples/rustfft.rs
@@ -28,7 +28,7 @@ fn benchmark_rustfft(n: usize) {
 
     let now = std::time::Instant::now();
     fft.process(&mut signal);
-    let elapsed = now.elapsed().as_micros();
+    let elapsed = now.elapsed().as_nanos();
     println!("{elapsed}");
 }
 

From 1343268bd53bd0fb29872591c636dfc8442b5ae0 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Sun, 14 Jul 2024 14:45:19 -0400
Subject: [PATCH 21/23] Remove `array_chunks`

---
 src/kernels.rs | 8 ++++----
 src/lib.rs     | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/kernels.rs b/src/kernels.rs
index 6d5e3aa..a1fe09c 100644
--- a/src/kernels.rs
+++ b/src/kernels.rs
@@ -123,8 +123,8 @@ pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
     const CHUNK_SIZE: usize = DIST << 1;
 
     reals
-        .array_chunks_mut::<CHUNK_SIZE>()
-        .zip(imags.array_chunks_mut::<CHUNK_SIZE>())
+        .chunks_exact_mut(CHUNK_SIZE)
+        .zip(imags.chunks_exact_mut(CHUNK_SIZE))
         .for_each(|(reals_chunk, imags_chunk)| {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
@@ -163,8 +163,8 @@ pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
 #[inline]
 pub(crate) fn fft_chunk_2<T: Float>(reals: &mut [T], imags: &mut [T]) {
     reals
-        .array_chunks_mut::<2>()
-        .zip(imags.array_chunks_mut::<2>())
+        .chunks_exact_mut(2)
+        .zip(imags.chunks_exact_mut(2))
         .for_each(|(reals_chunk, imags_chunk)| {
             let z0_re = reals_chunk[0];
             let z0_im = imags_chunk[0];
diff --git a/src/lib.rs b/src/lib.rs
index e1b6def..c0e20f7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,7 +8,7 @@
     clippy::suspicious
 )]
 #![forbid(unsafe_code)]
-#![feature(array_chunks, portable_simd, avx512_target_feature)]
+#![feature(portable_simd, avx512_target_feature)]
 
 #[cfg(feature = "complex-nums")]
 use std::simd::{f32x16, f64x8};

From e94f2fc328d0f65a113e5c70b7fe17269795bc8e Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 17 Jul 2024 00:23:46 -0400
Subject: [PATCH 22/23] Use new de-interleaving function

---
 src/lib.rs   | 84 ++--------------------------------------------------
 src/utils.rs | 17 +++++++++++
 2 files changed, 20 insertions(+), 81 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index c0e20f7..c71dfb8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -230,86 +230,6 @@ impl_fft_with_opts_and_plan_for!(
     16
 );
 
-#[cfg(feature = "complex-nums")]
-macro_rules! impl_separate_re_im {
-    ($func_name:ident, $precision:ty, $lanes:literal, $simd_vec:ty) => {
-        /// Utility function to separate interleaved format signals (i.e., Vector of Complex Number Structs)
-        /// into separate vectors for the corresponding real and imaginary components.
-        #[multiversion::multiversion(
-                                    targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
-                                            "x86_64+avx2+fma", // x86_64-v3
-                                            "x86_64+sse4.2", // x86_64-v2
-                                            "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
-                                            "x86+avx2+fma",
-                                            "x86+sse4.2",
-                                            "x86+sse2",
-                                        ))]
-        pub fn $func_name(
-            signal: &[Complex<$precision>],
-        ) -> (Vec<$precision>, Vec<$precision>) {
-            let n = signal.len();
-            let mut reals = vec![0.0; n];
-            let mut imags = vec![0.0; n];
-
-            let complex_t: &[$precision] = cast_slice(signal);
-            const CHUNK_SIZE: usize = $lanes * 2;
-
-            for ((chunk, chunk_re), chunk_im) in complex_t
-                .chunks_exact(CHUNK_SIZE)
-                .zip(reals.chunks_exact_mut($lanes))
-                .zip(imags.chunks_exact_mut($lanes))
-            {
-                let (first_half, second_half) = chunk.split_at($lanes);
-
-                let a = <$simd_vec>::from_slice(&first_half);
-                let b = <$simd_vec>::from_slice(&second_half);
-                let (re_deinterleaved, im_deinterleaved) = a.deinterleave(b);
-
-                chunk_re.copy_from_slice(&re_deinterleaved.to_array());
-                chunk_im.copy_from_slice(&im_deinterleaved.to_array());
-            }
-
-            let remainder = complex_t.chunks_exact(CHUNK_SIZE).remainder();
-            if !remainder.is_empty() {
-                let i = reals.len() - remainder.len() / 2;
-            remainder
-                .chunks_exact(2)
-                .zip(reals[i..].iter_mut())
-                .zip(imags[i..].iter_mut())
-                .for_each(|((c, re), im)| {
-                    *re = c[0];
-                    *im = c[1];
-                });
-            }
-
-            (reals, imags)
-        }
-    };
-}
-
-#[cfg(feature = "complex-nums")]
-impl_separate_re_im!(separate_re_im_f32, f32, 16, f32x16);
-
-#[cfg(feature = "complex-nums")]
-impl_separate_re_im!(separate_re_im_f64, f64, 8, f64x8);
-
-/// Utility function to combine separate vectors of real and imaginary components
-/// into a single vector of Complex Number Structs.
-///
-/// # Panics
-///
-/// Panics if `reals.len() != imags.len()`.
-#[cfg(feature = "complex-nums")]
-pub fn combine_re_im<T: Float>(reals: &[T], imags: &[T]) -> Vec<Complex<T>> {
-    assert_eq!(reals.len(), imags.len());
-
-    reals
-        .iter()
-        .zip(imags.iter())
-        .map(|(z_re, z_im)| Complex::new(*z_re, *z_im))
-        .collect()
-}
-
 #[cfg(test)]
 mod tests {
     use std::ops::Range;
@@ -323,6 +243,8 @@ mod tests {
     #[cfg(feature = "complex-nums")]
     #[test]
     fn test_separate_and_combine_re_im() {
+        use utils::deinterleave;
+
         let complex_vec: Vec<_> = vec![
             Complex::new(1.0, 2.0),
             Complex::new(3.0, 4.0),
@@ -330,7 +252,7 @@ mod tests {
             Complex::new(7.0, 8.0),
         ];
 
-        let (reals, imags) = separate_re_im_f64(&complex_vec);
+        let (reals, imags) = deinterleave(&complex_vec);
 
         let recombined_vec = combine_re_im(&reals, &imags);
 
diff --git a/src/utils.rs b/src/utils.rs
index 70ed734..d4be2fe 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -59,6 +59,23 @@ pub(crate) fn deinterleave<T: Copy + Default + SimdElement>(input: &[T]) -> (Vec
     (out_odd, out_even)
 }
 
+/// Utility function to combine separate vectors of real and imaginary components
+/// into a single vector of Complex Number Structs.
+///
+/// # Panics
+///
+/// Panics if `reals.len() != imags.len()`.
+#[cfg(feature = "complex-nums")]
+pub(crate) fn combine_re_im<T: Float>(reals: &[T], imags: &[T]) -> Vec<Complex<T>> {
+    assert_eq!(reals.len(), imags.len());
+
+    reals
+        .iter()
+        .zip(imags.iter())
+        .map(|(z_re, z_im)| Complex::new(*z_re, *z_im))
+        .collect()
+}
+
 #[cfg(test)]
 mod tests {
     use super::deinterleave;

From 713b3a66cf36e33a0450877629d5305ca3434365 Mon Sep 17 00:00:00 2001
From: Saveliy Yusufov <sy2685@columbia.edu>
Date: Wed, 17 Jul 2024 01:16:15 -0400
Subject: [PATCH 23/23] Update pre-commit hook to run clippy everywhere

---
 benches/README.md | 26 +++++------------------
 benches/bench.rs  | 32 ++++++++++++++++------------
 hooks/pre-commit  |  4 ++--
 src/lib.rs        | 38 ++++++---------------------------
 src/twiddles.rs   |  2 +-
 src/utils.rs      | 54 +++++++++++++++++++++++++++++++++++++++++++++--
 6 files changed, 86 insertions(+), 70 deletions(-)

diff --git a/benches/README.md b/benches/README.md
index 3e84f65..e276e06 100644
--- a/benches/README.md
+++ b/benches/README.md
@@ -4,24 +4,15 @@
 
 ### Setup Environment
 
-1. Install [FFTW3](http://www.fftw.org/download.html)[^1]
+1. Clone the `PhastFT` git repository [^2].
 
-   It may be possible to install `fftw3` using a package manager.
-
-   ##### debian
-   ```bash
-   sudo apt install libfftw3-dev
-   ```
-
-2. Clone the `PhastFT` git repository [^2].
-
-3. Create virtual env
+2. Create virtual env
 
 ```bash
 cd ~/PhastFT/benches && python3 -m venv .env && source .env/bin/activate
 ```
 
-4. Install python dependencies[^1]
+3. Install python dependencies[^1]
 
 ```bash
 pip install -r requirements.txt
@@ -29,10 +20,10 @@ cd ~/PhastFT/pyphastft
 pip install .
 ```
 
-5. Run the `FFTW3` vs. `RustFFT` vs. `PhastFT` benchmark for all inputs of size `2^n`, where `n \in [4, 30].`
+5. Run the `FFTW3-RB` vs. `RustFFT` vs. `PhastFT` benchmarks`
 
 ```bash
-./benchmark.sh 4 29
+python run_benches.py
 ```
 
 6. Plot the results
@@ -101,13 +92,6 @@ On linux, open access to performance monitoring, and observability operations fo
 echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid
 ```
 
-Add debug to `Cargo.toml` under `profile.release`:
-
-```bash
-[profile.release]
-debug = true
-```
-
 Finally, run:
 
 ```bash
diff --git a/benches/bench.rs b/benches/bench.rs
index 305399d..00d7833 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -69,7 +69,7 @@ fn benchmark_forward_f32(c: &mut Criterion) {
         let planner = Planner32::new(len, Direction::Forward);
         let (mut reals, mut imags) = generate_numbers(len);
 
-        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| {
+        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &_len| {
             b.iter(|| {
                 fft_32_with_opts_and_plan(
                     black_box(&mut reals),
@@ -85,7 +85,7 @@ fn benchmark_forward_f32(c: &mut Criterion) {
         let fft = planner.plan_fft_forward(len);
         let mut signal = generate_complex_numbers(len);
 
-        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| {
+        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &_len| {
             b.iter(|| fft.process(black_box(&mut signal)));
         });
     }
@@ -103,9 +103,12 @@ fn benchmark_inverse_f32(c: &mut Criterion) {
         c.bench_function(&id, |b| {
             let (mut reals, mut imags) = generate_numbers(len);
             b.iter(|| {
-                black_box(fft_32_with_opts_and_plan(
-                    &mut reals, &mut imags, &options, &planner,
-                ));
+                fft_32_with_opts_and_plan(
+                    black_box(&mut reals),
+                    black_box(&mut imags),
+                    black_box(&options),
+                    black_box(&planner),
+                );
             });
         });
     }
@@ -122,7 +125,7 @@ fn benchmark_forward_f64(c: &mut Criterion) {
         let (mut reals, mut imags) = generate_numbers(len);
         group.throughput(Throughput::Elements(len as u64));
 
-        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| {
+        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &_len| {
             b.iter(|| {
                 fft_64_with_opts_and_plan(
                     black_box(&mut reals),
@@ -138,7 +141,7 @@ fn benchmark_forward_f64(c: &mut Criterion) {
         let fft = planner.plan_fft_forward(len);
         let mut signal = generate_complex_numbers(len);
 
-        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| {
+        group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &_len| {
             b.iter(|| fft.process(black_box(&mut signal)));
         });
     }
@@ -156,9 +159,12 @@ fn benchmark_inverse_f64(c: &mut Criterion) {
         c.bench_function(&id, |b| {
             let (mut reals, mut imags) = generate_numbers(len);
             b.iter(|| {
-                black_box(fft_64_with_opts_and_plan(
-                    &mut reals, &mut imags, &options, &planner,
-                ));
+                fft_64_with_opts_and_plan(
+                    black_box(&mut reals),
+                    black_box(&mut imags),
+                    black_box(&options),
+                    black_box(&planner),
+                );
             });
         });
     }
@@ -166,9 +172,9 @@ fn benchmark_inverse_f64(c: &mut Criterion) {
 
 criterion_group!(
     benches,
-    // benchmark_forward_f32,
-    // benchmark_inverse_f32,
+    benchmark_forward_f32,
+    benchmark_inverse_f32,
     benchmark_forward_f64,
-    // benchmark_inverse_f64
+    benchmark_inverse_f64
 );
 criterion_main!(benches);
diff --git a/hooks/pre-commit b/hooks/pre-commit
index 6d3f529..8f289a5 100755
--- a/hooks/pre-commit
+++ b/hooks/pre-commit
@@ -9,13 +9,13 @@ then
     exit 1
 fi
 
-if ! cargo clippy -- -D warnings
+if ! cargo clippy --all-targets --all-features --tests -- -D warnings 
 then
     echo "There are some clippy issues."
     exit 1
 fi
 
-if ! cargo test
+if ! cargo test --all-features
 then
     echo "There are some test issues."
     exit 1
diff --git a/src/lib.rs b/src/lib.rs
index c71dfb8..8f03b6d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,14 +11,9 @@
 #![feature(portable_simd, avx512_target_feature)]
 
 #[cfg(feature = "complex-nums")]
-use std::simd::{f32x16, f64x8};
-
-#[cfg(feature = "complex-nums")]
-use bytemuck::cast_slice;
+use crate::utils::{combine_re_im, deinterleave_complex32, deinterleave_complex64};
 #[cfg(feature = "complex-nums")]
 use num_complex::Complex;
-#[cfg(feature = "complex-nums")]
-use num_traits::Float;
 
 use crate::cobra::cobra_apply;
 use crate::kernels::{
@@ -96,9 +91,9 @@ macro_rules! impl_fft_interleaved_for {
 }
 
 #[cfg(feature = "complex-nums")]
-impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32, separate_re_im_f32);
+impl_fft_interleaved_for!(fft_32_interleaved, f32, fft_32, deinterleave_complex32);
 #[cfg(feature = "complex-nums")]
-impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64, separate_re_im_f64);
+impl_fft_interleaved_for!(fft_64_interleaved, f64, fft_64, deinterleave_complex64);
 
 macro_rules! impl_fft_with_opts_and_plan_for {
     ($func_name:ident, $precision:ty, $planner:ty, $simd_butterfly_kernel:ident, $lanes:literal) => {
@@ -240,25 +235,6 @@ mod tests {
 
     use super::*;
 
-    #[cfg(feature = "complex-nums")]
-    #[test]
-    fn test_separate_and_combine_re_im() {
-        use utils::deinterleave;
-
-        let complex_vec: Vec<_> = vec![
-            Complex::new(1.0, 2.0),
-            Complex::new(3.0, 4.0),
-            Complex::new(5.0, 6.0),
-            Complex::new(7.0, 8.0),
-        ];
-
-        let (reals, imags) = deinterleave(&complex_vec);
-
-        let recombined_vec = combine_re_im(&reals, &imags);
-
-        assert_eq!(complex_vec, recombined_vec);
-    }
-
     macro_rules! non_power_of_2_planner {
         ($test_name:ident, $planner:ty) => {
             #[should_panic]
@@ -436,9 +412,9 @@ mod tests {
         let mut re = reals.clone();
         let mut im = imags.clone();
 
-        let mut planner = Planner64::new(num_points, Direction::Forward);
+        let planner = Planner64::new(num_points, Direction::Forward);
         let opts = Options::guess_options(reals.len());
-        fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &mut planner);
+        fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner);
 
         fft_64(&mut re, &mut im, Direction::Forward);
 
@@ -467,9 +443,9 @@ mod tests {
                 let mut re = reals.clone();
                 let mut im = imags.clone();
 
-                let mut planner = Planner32::new(num_points, direction);
+                let planner = Planner32::new(num_points, direction);
                 let opts = Options::guess_options(reals.len());
-                fft_32_with_opts_and_plan(&mut reals, &mut imags, &opts, &mut planner);
+                fft_32_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner);
 
                 fft_32(&mut re, &mut im, direction);
 
diff --git a/src/twiddles.rs b/src/twiddles.rs
index 6008ad8..144e9fc 100644
--- a/src/twiddles.rs
+++ b/src/twiddles.rs
@@ -239,7 +239,7 @@ mod tests {
 
             let actual_w_im = -fwd_twiddles_re[(i + dist / 2) % dist];
             //assert_float_closeness(actual_w_im, expected_w_im, 1e-6);
-            print!("actual: {actual_w_im} expected: {expected_w_im}\n");
+            println!("actual: {actual_w_im} expected: {expected_w_im}");
         }
         println!("{:?}", fwd_twiddles_re);
         println!("{:?}", fwd_twiddles_im);
diff --git a/src/utils.rs b/src/utils.rs
index d4be2fe..3c0f240 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,5 +1,14 @@
 //! Utility functions such as interleave/deinterleave
 
+#[cfg(feature = "complex-nums")]
+use num_complex::Complex;
+
+#[cfg(feature = "complex-nums")]
+use num_traits::Float;
+
+#[cfg(feature = "complex-nums")]
+use bytemuck::cast_slice;
+
 use std::simd::{prelude::Simd, simd_swizzle, SimdElement};
 
 // We don't multiversion for AVX-512 here and keep the chunk size below AVX-512
@@ -59,6 +68,30 @@ pub(crate) fn deinterleave<T: Copy + Default + SimdElement>(input: &[T]) -> (Vec
     (out_odd, out_even)
 }
 
+/// Utility function to separate a slice of [`Complex64``]
+/// into a single vector of Complex Number Structs.
+///
+/// # Panics
+///
+/// Panics if `reals.len() != imags.len()`.
+#[cfg(feature = "complex-nums")]
+pub(crate) fn deinterleave_complex64(signal: &[Complex<f64>]) -> (Vec<f64>, Vec<f64>) {
+    let complex_t: &[f64] = cast_slice(signal);
+    deinterleave(complex_t)
+}
+
+/// Utility function to separate a slice of [`Complex32``]
+/// into a single vector of Complex Number Structs.
+///
+/// # Panics
+///
+/// Panics if `reals.len() != imags.len()`.
+#[cfg(feature = "complex-nums")]
+pub(crate) fn deinterleave_complex32(signal: &[Complex<f32>]) -> (Vec<f32>, Vec<f32>) {
+    let complex_t: &[f32] = cast_slice(signal);
+    deinterleave(complex_t)
+}
+
 /// Utility function to combine separate vectors of real and imaginary components
 /// into a single vector of Complex Number Structs.
 ///
@@ -78,10 +111,10 @@ pub(crate) fn combine_re_im<T: Float>(reals: &[T], imags: &[T]) -> Vec<Complex<T
 
 #[cfg(test)]
 mod tests {
-    use super::deinterleave;
+    use super::*;
 
     fn gen_test_vec(len: usize) -> Vec<usize> {
-        (0..len).into_iter().collect()
+        (0..len).collect()
     }
 
     /// Slow but obviously correct implementation of deinterleaving,
@@ -100,4 +133,21 @@ mod tests {
             assert_eq!(naive_b, opt_b);
         }
     }
+
+    #[cfg(feature = "complex-nums")]
+    #[test]
+    fn test_separate_and_combine_re_im() {
+        let complex_vec: Vec<_> = vec![
+            Complex::new(1.0, 2.0),
+            Complex::new(3.0, 4.0),
+            Complex::new(5.0, 6.0),
+            Complex::new(7.0, 8.0),
+        ];
+
+        let (reals, imags) = deinterleave_complex64(&complex_vec);
+
+        let recombined_vec = combine_re_im(&reals, &imags);
+
+        assert_eq!(complex_vec, recombined_vec);
+    }
 }