From 7a60c2859fcbe9fa86e8959a43764532708e1108 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 6 May 2025 10:21:23 +0200 Subject: [PATCH 01/11] Add ipp feature --- Cargo.lock | 73 ++- Cargo.toml | 5 + src/fft_convolver.rs | 1014 ++++++++++++++++++++++++------------------ 3 files changed, 662 insertions(+), 430 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bd6cbfa..9b1d380 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "autocfg" @@ -12,10 +12,81 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" name = "convolution" version = "0.1.0" dependencies = [ + "ipp-sys", "realfft", "rustfft", ] +[[package]] +name = "ipp-headers-sys" +version = "0.4.5" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" + +[[package]] +name = "ipp-sys" +version = "0.4.5" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-headers-sys", + "link-ippcore", + "link-ippcv", + "link-ippi", + "link-ipps", + "link-ippvm", +] + +[[package]] +name = "ipp-sys-build-help" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" + +[[package]] +name = "link-ippcore" +version = "0.1.1" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", +] + +[[package]] +name = "link-ippcv" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ipps", +] + +[[package]] +name = "link-ippi" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ipps", +] + +[[package]] +name = "link-ipps" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ippvm", +] + +[[package]] +name = "link-ippvm" +version = "0.1.0" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", +] + [[package]] name = "num-complex" version = "0.4.6" diff --git a/Cargo.toml b/Cargo.toml index 1b2270d..48b314f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,3 +8,8 @@ edition = "2021" [dependencies] realfft = "3.3.0" rustfft = "6.1.0" +ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", branch = "dan/generate-2022-bindings", features = ["2022"], optional = true } + +[features] +default = [] +ipp = ["ipp-sys"] diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs index b3b1818..47f3c75 100644 --- a/src/fft_convolver.rs +++ b/src/fft_convolver.rs @@ -1,498 +1,654 @@ -use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; -use rustfft::num_complex::Complex; -use std::sync::Arc; - -use crate::{Convolution, Sample}; - -#[derive(Clone)] 
-pub struct Fft { - fft_forward: Arc>, - fft_inverse: Arc>, -} - -impl Default for Fft { - fn default() -> Self { - let mut planner = RealFftPlanner::::new(); - Self { - fft_forward: planner.plan_fft_forward(0), - fft_inverse: planner.plan_fft_inverse(0), - } +mod rust_fft { + use crate::{Convolution, Sample}; + use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; + use rustfft::num_complex::Complex; + use std::sync::Arc; + #[derive(Clone)] + pub struct Fft { + fft_forward: Arc>, + fft_inverse: Arc>, } -} - -impl std::fmt::Debug for Fft { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "") - } -} -impl Fft { - pub fn init(&mut self, length: usize) { - let mut planner = RealFftPlanner::::new(); - self.fft_forward = planner.plan_fft_forward(length); - self.fft_inverse = planner.plan_fft_inverse(length); + impl Default for Fft { + fn default() -> Self { + let mut planner = RealFftPlanner::::new(); + Self { + fft_forward: planner.plan_fft_forward(0), + fft_inverse: planner.plan_fft_inverse(0), + } + } } - pub fn forward(&self, input: &mut [f32], output: &mut [Complex]) -> Result<(), FftError> { - self.fft_forward.process(input, output)?; - Ok(()) + impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "") + } } - pub fn inverse(&self, input: &mut [Complex], output: &mut [f32]) -> Result<(), FftError> { - self.fft_inverse.process(input, output)?; - - // FFT Normalization - let len = output.len(); - output.iter_mut().for_each(|bin| *bin /= len as f32); - - Ok(()) - } -} + impl Fft { + pub fn init(&mut self, length: usize) { + let mut planner = RealFftPlanner::::new(); + self.fft_forward = planner.plan_fft_forward(length); + self.fft_inverse = planner.plan_fft_inverse(length); + } -pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 -} + pub fn forward( + &self, + input: &mut [f32], + output: &mut [Complex], + ) -> Result<(), FftError> { + self.fft_forward.process(input, output)?; + Ok(()) + } -pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size); - dst[0..src_size].clone_from_slice(&src[0..src_size]); - dst[src_size..].iter_mut().for_each(|value| *value = 0.); -} + pub fn inverse( + &self, + input: &mut [Complex], + output: &mut [f32], + ) -> Result<(), FftError> { + self.fft_inverse.process(input, output)?; -pub fn complex_multiply_accumulate( - result: &mut [Complex], - a: &[Complex], - b: &[Complex], -) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 4 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; - result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; - result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; - result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; - result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; - result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; - result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; - result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; - } - for i in end4..len { - result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; - result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; - } -} + // FFT Normalization + let len = output.len(); + output.iter_mut().for_each(|bin| *bin /= len as f32); -pub fn 
sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0] = a[i + 0] + b[i + 0]; - result[i + 1] = a[i + 1] + b[i + 1]; - result[i + 2] = a[i + 2] + b[i + 2]; - result[i + 3] = a[i + 3] + b[i + 3]; + Ok(()) + } } - for i in end4..len { - result[i] = a[i] + b[i]; + #[derive(Default, Clone)] + pub struct FFTConvolver { + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>>, + segments_ir: Vec>>, + fft_buffer: Vec, + fft: Fft, + pre_multiplied: Vec>, + conv: Vec>, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, } -} -#[derive(Default, Clone)] -pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>>, - segments_ir: Vec>>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec>, - conv: Vec>, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, -} -impl Convolution for FFTConvolver { - fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { - if max_response_length < impulse_response.len() { - panic!( + impl Convolution for FFTConvolver { + fn init( + impulse_response: &[Sample], + block_size: usize, + max_response_length: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( "max_response_length must be at least the length of the initial impulse response" ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = complex_size(seg_size); + + // FFT + let mut fft = Fft::default(); + fft.init(seg_size); + let mut fft_buffer = vec![0.; seg_size]; + + // prepare segments + let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; + let mut segments_ir = Vec::new(); + + // prepare ir + for i in 0..seg_count { + let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } 
else { + remaining + }; + copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut fft_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } - // prepare convolution buffers - let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; - let conv = vec![Complex::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, + // prepare convolution buffers + let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; + let conv = vec![Complex::new(0., 0.); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + } } - } - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); + fn update(&mut self, response: &[Sample]) { + let new_ir_len = response.len(); - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } - if self.ir_len == 0 { - return; - } + if self.ir_len == 0 { + return; + } - self.fft_buffer.fill(0.); - self.conv.fill(Complex::new(0., 0.)); - self.pre_multiplied.fill(Complex::new(0., 0.)); - self.overlap.fill(0.); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } + self.fft_buffer.fill(0.); + self.conv.fill(Complex::new(0., 0.)); + self.pre_multiplied.fill(Complex::new(0., 0.)); + self.overlap.fill(0.); - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - self.segments_ir[i].fill(Complex::new(0., 0.)); - } - } + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - output.fill(0.); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + copy_and_pad( + &mut self.fft_buffer, + &response[i * 
self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } - let input_buffer_pos = self.input_buffer_fill; - self.input_buffer[input_buffer_pos..input_buffer_pos + processing] - .clone_from_slice(&input[processed..processed + processing]); + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + self.segments_ir[i].fill(Complex::new(0., 0.)); + } + } - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_err) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + if self.active_seg_count == 0 { output.fill(0.); - return; // error! + return; } - // complex multiplication - if input_buffer_was_empty { - self.pre_multiplied.fill(Complex { re: 0., im: 0. }); - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - ); + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + let input_buffer_pos = self.input_buffer_fill; + self.input_buffer[input_buffer_pos..input_buffer_pos + processing] + .clone_from_slice(&input[processed..processed + processing]); + + // Forward FFT + copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_err) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + output.fill(0.); + return; // error! } - } - self.conv.clone_from_slice(&self.pre_multiplied); - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - ); - // Backward FFT - if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - output.fill(0.); - return; // error! - } + // complex multiplication + if input_buffer_was_empty { + self.pre_multiplied.fill(Complex { re: 0., im: 0. }); + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + ); + } + } + self.conv.clone_from_slice(&self.pre_multiplied); + complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + ); - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); + // Backward FFT + if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + output.fill(0.); + return; // error! 
+ } - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - self.input_buffer.fill(0.); - self.input_buffer_fill = 0; - // Save the overlap - self.overlap - .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; + // Add overlap + sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + self.input_buffer.fill(0.); + self.input_buffer_fill = 0; + // Save the overlap + self.overlap + .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; } - processed += processing; } } -} + pub fn complex_size(size: usize) -> usize { + (size / 2) + 1 + } -#[test] -fn test_fft_convolver_passthrough() { - let mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); + pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size); + dst[0..src_size].clone_from_slice(&src[0..src_size]); + dst[src_size..].iter_mut().for_each(|value| *value = 0.); } -} -#[derive(Clone)] -pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, -} + pub fn complex_multiply_accumulate( + result: &mut [Complex], + a: &[Complex], + b: &[Complex], + ) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 4 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; + result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; + result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; + result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; + result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; + result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; + result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; + result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + } + for i in end4..len { + result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; + result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + } + } -const HEAD_BLOCK_SIZE: usize = 128; -const TAIL_BLOCK_SIZE: usize = 1024; + pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0] = a[i + 0] + 
b[i + 0]; + result[i + 1] = a[i + 1] + b[i + 1]; + result[i + 2] = a[i + 2] + b[i + 2]; + result[i + 3] = a[i + 3] + b[i + 3]; + } + for i in end4..len { + result[i] = a[i] + b[i]; + } + } + #[test] + fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } + } + #[derive(Clone)] + pub struct TwoStageFFTConvolver { + head_convolver: FFTConvolver, + tail_convolver0: FFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: FFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + } + + const HEAD_BLOCK_SIZE: usize = 128; + const TAIL_BLOCK_SIZE: usize = 1024; -impl Convolution for TwoStageFFTConvolver { - fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; + impl Convolution for TwoStageFFTConvolver { + fn init( + impulse_response: &[Sample], + _block_size: usize, + max_response_length: usize, + ) -> Self { + let head_block_size = HEAD_BLOCK_SIZE; + let tail_block_size = TAIL_BLOCK_SIZE; - if max_response_length < impulse_response.len() { - panic!( + if max_response_length < impulse_response.len() { + panic!( "max_response_length must be at least the length of the initial impulse response" ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = FFTConvolver::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = (max_response_length > tail_block_size) + .then(|| { + let tail_ir_len = + std::cmp::min(max_response_length - tail_block_size, tail_block_size); + FFTConvolver::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = (max_response_length > 2 * tail_block_size) + .then(|| { + let tail_ir_len = max_response_length - 2 * tail_block_size; + FFTConvolver::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + TwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + } } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); 
- FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, + + fn update(&mut self, _response: &[Sample]) { + todo!() } - } - fn update(&mut self, _response: &[Sample]) { - todo!() - } + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + // Head + self.head_convolver.process(input, output); - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - // Head - self.head_convolver.process(input, output); + // Tail + if self.tail_input.is_empty() { + return; + } - // Tail - if self.tail_input.is_empty() { - return; - } + let len = input.len(); + let mut processed = 0; - let len = input.len(); - let mut processed = 0; + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + ); - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), - ); + // Sum head and tail + let sum_begin = processed; + let sum_end = processed + processing; + + // Sum: 1st tail block + if self.tail_precalculated0.len() > 0 { + let mut precalculated_pos = self.precalculated_pos; + for i in sum_begin..sum_end { + output[i] += self.tail_precalculated0[precalculated_pos]; + precalculated_pos += 1; + } + } - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; + // Sum: 2nd-Nth tail block + if self.tail_precalculated.len() > 0 { + let mut precalculated_pos = self.precalculated_pos; + for i in sum_begin..sum_end { + output[i] += self.tail_precalculated[precalculated_pos]; + precalculated_pos += 1; + } + } - // Sum: 1st tail block - if self.tail_precalculated0.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated0[precalculated_pos]; - precalculated_pos += 1; + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] + .copy_from_slice(&input[processed..processed + processing]); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 + { + assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); + let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], + &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + ); + if 
self.tail_input_fill == TAIL_BLOCK_SIZE { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if self.tail_precalculated.len() > 0 + && self.tail_input_fill == TAIL_BLOCK_SIZE + && self.tail_output.len() == TAIL_BLOCK_SIZE + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); } - } - // Sum: 2nd-Nth tail block - if self.tail_precalculated.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated[precalculated_pos]; - precalculated_pos += 1; + if self.tail_input_fill == TAIL_BLOCK_SIZE { + self.tail_input_fill = 0; + self.precalculated_pos = 0; } + + processed += processing; + } + } + } +} + +#[cfg(feature = "ipp")] +mod ipp_fft { + use ipp_sys::*; + use num_complex::Complex32; + use std::mem; + use std::ptr; + use std::slice; + + pub struct FFT { + size: usize, + spec: Vec, + scratch_buffer: Vec, + } + + impl FFT { + pub fn new(size: usize) -> Self { + assert!(size.is_power_of_two(), "FFT size must be a power of 2"); + + let mut spec_size = 0; + let mut init_size = 0; + let mut work_buf_size = 0; + + unsafe { + ippsDFTGetSize_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + &mut spec_size, + &mut init_size, + &mut work_buf_size, + ); } - self.precalculated_pos += processing; + // Allocate aligned memory for buffers to get best SIMD performance + let mut init_buffer = allocate_aligned_buffer(init_size as usize); + let scratch_buffer = allocate_aligned_buffer(work_buf_size as usize); + let mut spec = allocate_aligned_buffer(spec_size as usize); + + unsafe { + ippsDFTInit_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + spec.as_mut_ptr(), + init_buffer.as_mut_ptr(), + ); + } - // Fill input buffer for tail convolution - self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] - .copy_from_slice(&input[processed..processed + processing]); - self.tail_input_fill += processing; + Self { + size, + spec, + scratch_buffer, + } + } + + pub fn forward(&mut self, input: &[f32], output: &mut [Complex32]) { + assert_eq!(input.len(), self.size); + assert_eq!(output.len(), self.size / 2 + 1); - // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + unsafe { + ippsDFTFwd_RToCCS_32f( + input.as_ptr(), + output.as_mut_ptr() as *mut _, + self.spec.as_ptr(), + self.scratch_buffer.as_mut_ptr(), ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } } + } + + pub fn inverse(&mut self, input: &[Complex32], output: &mut [f32]) { + assert_eq!(input.len(), self.size / 2 + 1); + assert_eq!(output.len(), self.size); - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if self.tail_precalculated.len() > 0 - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut 
self.tail_output); + unsafe { + ippsDFTInv_CCSToR_32f( + input.as_ptr() as *const _, + output.as_mut_ptr(), + self.spec.as_ptr(), + self.scratch_buffer.as_mut_ptr(), + ); } + } - if self.tail_input_fill == TAIL_BLOCK_SIZE { - self.tail_input_fill = 0; - self.precalculated_pos = 0; + pub fn size(&self) -> usize { + self.size + } + + pub fn normalization_factor(&self) -> f32 { + self.size as f32 + } + } + + // Helper function to allocate aligned memory using IPP functions + fn allocate_aligned_buffer(size: usize) -> Vec { + if size == 0 { + return Vec::new(); + } + + unsafe { + let ptr = ippsMalloc_8u(size as i32); + if ptr.is_null() { + panic!("Failed to allocate aligned memory"); } - processed += processing; + // Create a vector that will free the memory when dropped + let mut v = Vec::with_capacity(0); + let v_ptr = v.as_mut_ptr(); + mem::forget(v); + + // Construct a vector from the IPP-aligned pointer + Vec::from_raw_parts(ptr, size, size) + } + } + + // Drop trait implementation to free aligned memory + impl Drop for FFT { + fn drop(&mut self) { + unsafe { + if !self.spec.is_empty() { + let ptr = self.spec.as_mut_ptr(); + let len = self.spec.len(); + mem::forget(mem::replace(&mut self.spec, Vec::new())); + ippsFree(ptr as *mut _); + } + + if !self.scratch_buffer.is_empty() { + let ptr = self.scratch_buffer.as_mut_ptr(); + let len = self.scratch_buffer.len(); + mem::forget(mem::replace(&mut self.scratch_buffer, Vec::new())); + ippsFree(ptr as *mut _); + } + } } } } + +#[cfg(not(feature = "ipp"))] +pub use rust_fft::{complex_multiply_accumulate, complex_size, sum, FFTConvolver, Fft}; + +#[cfg(feature = "ipp")] +pub use self::ipp_fft::FFT; From 38bbb61e71a99acfd9b3a5fd435bdc23a75d9eea Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 6 May 2025 10:37:32 +0200 Subject: [PATCH 02/11] Add crude FFT and convolvers implementation using ipp --- Cargo.lock | 1 + Cargo.toml | 1 + src/fft_convolver.rs | 702 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 665 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9b1d380..5ffe61c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,6 +13,7 @@ name = "convolution" version = "0.1.0" dependencies = [ "ipp-sys", + "num-complex", "realfft", "rustfft", ] diff --git a/Cargo.toml b/Cargo.toml index 48b314f..33c4d45 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ edition = "2021" realfft = "3.3.0" rustfft = "6.1.0" ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", branch = "dan/generate-2022-bindings", features = ["2022"], optional = true } +num-complex = "0.4.6" [features] default = [] diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs index 47f3c75..f83b2d7 100644 --- a/src/fft_convolver.rs +++ b/src/fft_convolver.rs @@ -513,20 +513,60 @@ mod rust_fft { #[cfg(feature = "ipp")] mod ipp_fft { + use crate::{Convolution, Sample}; use ipp_sys::*; use num_complex::Complex32; use std::mem; use std::ptr; use std::slice; - pub struct FFT { + pub struct Fft { size: usize, spec: Vec, scratch_buffer: Vec, } - impl FFT { + impl Default for Fft { + fn default() -> Self { + Self { + size: 0, + spec: Vec::new(), + scratch_buffer: Vec::new(), + } + } + } + + impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "IppFft(size: {})", self.size) + } + } + + impl Clone for Fft { + fn clone(&self) -> Self { + let mut new_fft = Fft::default(); + if self.size > 0 { + new_fft.init(self.size); + } + new_fft + } + } + + impl Fft { pub fn new(size: usize) -> Self { + 
let mut fft = Self::default(); + fft.init(size); + fft + } + + pub fn init(&mut self, size: usize) { + // Clean up old resources if any + self.clean_up(); + + if size == 0 { + return; + } + assert!(size.is_power_of_two(), "FFT size must be a power of 2"); let mut spec_size = 0; @@ -554,52 +594,108 @@ mod ipp_fft { size as i32, 8, // IPP_FFT_NODIV_BY_ANY 0, // No special hint - spec.as_mut_ptr(), + spec.as_mut_ptr() as *mut ipp_sys::IppsDFTSpec_R_32f, init_buffer.as_mut_ptr(), ); } - Self { - size, - spec, - scratch_buffer, + // Free the init buffer as it's no longer needed + unsafe { + if !init_buffer.is_empty() { + let ptr = init_buffer.as_mut_ptr(); + mem::forget(mem::replace(&mut init_buffer, Vec::new())); + ippsFree(ptr as *mut _); + } } + + self.size = size; + self.spec = spec; + self.scratch_buffer = scratch_buffer; } - pub fn forward(&mut self, input: &[f32], output: &mut [Complex32]) { - assert_eq!(input.len(), self.size); - assert_eq!(output.len(), self.size / 2 + 1); + pub fn forward( + &mut self, + input: &mut [f32], + output: &mut [Complex32], + ) -> Result<(), &'static str> { + if self.size == 0 { + return Err("FFT not initialized"); + } + + assert_eq!(input.len(), self.size, "Input length must match FFT size"); + assert_eq!( + output.len(), + self.size / 2 + 1, + "Output length must be size/2 + 1" + ); unsafe { ippsDFTFwd_RToCCS_32f( input.as_ptr(), output.as_mut_ptr() as *mut _, - self.spec.as_ptr(), + self.spec.as_ptr() as *const DFTSpec_R_32f, self.scratch_buffer.as_mut_ptr(), ); } + + Ok(()) } - pub fn inverse(&mut self, input: &[Complex32], output: &mut [f32]) { - assert_eq!(input.len(), self.size / 2 + 1); - assert_eq!(output.len(), self.size); + pub fn inverse( + &mut self, + input: &mut [Complex32], + output: &mut [f32], + ) -> Result<(), &'static str> { + if self.size == 0 { + return Err("FFT not initialized"); + } + + assert_eq!( + input.len(), + self.size / 2 + 1, + "Input length must be size/2 + 1" + ); + assert_eq!(output.len(), self.size, "Output length must match FFT size"); unsafe { ippsDFTInv_CCSToR_32f( input.as_ptr() as *const _, output.as_mut_ptr(), - self.spec.as_ptr(), + self.spec.as_ptr() as *const DFTSpec_R_32f, self.scratch_buffer.as_mut_ptr(), ); } + + // Normalize the result + unsafe { + ippsDivC_32f_I(self.size as f32, output.as_mut_ptr(), self.size as i32); + } + + Ok(()) } - pub fn size(&self) -> usize { - self.size + fn clean_up(&mut self) { + unsafe { + if !self.spec.is_empty() { + let ptr = self.spec.as_mut_ptr(); + mem::forget(mem::replace(&mut self.spec, Vec::new())); + ippsFree(ptr as *mut _); + } + + if !self.scratch_buffer.is_empty() { + let ptr = self.scratch_buffer.as_mut_ptr(); + mem::forget(mem::replace(&mut self.scratch_buffer, Vec::new())); + ippsFree(ptr as *mut _); + } + } + self.size = 0; } + } - pub fn normalization_factor(&self) -> f32 { - self.size as f32 + // Drop trait implementation to free aligned memory + impl Drop for Fft { + fn drop(&mut self) { + self.clean_up(); } } @@ -615,40 +711,568 @@ mod ipp_fft { panic!("Failed to allocate aligned memory"); } - // Create a vector that will free the memory when dropped - let mut v = Vec::with_capacity(0); - let v_ptr = v.as_mut_ptr(); - mem::forget(v); - // Construct a vector from the IPP-aligned pointer Vec::from_raw_parts(ptr, size, size) } } - // Drop trait implementation to free aligned memory - impl Drop for FFT { - fn drop(&mut self) { + // Helper functions for FFT convolution + pub fn complex_size(size: usize) -> usize { + (size / 2) + 1 + } + + pub fn copy_and_pad(dst: 
&mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size, "Destination buffer too small"); + + // Copy the source data + unsafe { + ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); + + // Zero-pad the rest + if dst.len() > src_size { + ippsZero_32f( + dst.as_mut_ptr().add(src_size), + (dst.len() - src_size) as i32, + ); + } + } + } + + pub fn complex_multiply_accumulate(result: &mut [Complex32], a: &[Complex32], b: &[Complex32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + + unsafe { + // IPP doesn't have a direct complex multiply-accumulate, so we do it in two steps + // First, multiply the complex arrays + let len = result.len(); + let mut temp = vec![Complex32::new(0.0, 0.0); len]; + + ippsMulC_32fc( + a.as_ptr() as *const ipp_sys::Ipp32fc, + *(b.as_ptr() as *const ipp_sys::Ipp32fc), + temp.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len as i32, + ); + + // Then add to the result + ippsAdd_32fc( + temp.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len as i32, + ); + } + } + + pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + + unsafe { + ippsAdd_32f( + a.as_ptr(), + b.as_ptr(), + result.as_mut_ptr(), + result.len() as i32, + ); + } + } + + #[derive(Default, Clone)] + pub struct FFTConvolver { + ir_len: usize, + block_size: usize, + seg_size: usize, + seg_count: usize, + active_seg_count: usize, + fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: Fft, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + } + + impl Convolution for FFTConvolver { + fn init( + impulse_response: &[Sample], + block_size: usize, + max_response_length: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = complex_size(seg_size); + + // FFT + let mut fft = Fft::default(); + fft.init(seg_size); + let mut fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::new(); + for _ in 0..seg_count { + segments.push(vec![Complex32::new(0., 0.); fft_complex_size]); + } + + let mut segments_ir = Vec::new(); + + // prepare ir + for i in 0..seg_count { + let mut segment = vec![Complex32::new(0., 0.); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut fft_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![Complex32::new(0., 0.); fft_complex_size]; + let conv = vec![Complex32::new(0., 0.); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + Self { + ir_len, + block_size, + seg_size, + 
seg_count, + active_seg_count, + fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + } + } + + fn update(&mut self, response: &[Sample]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + unsafe { - if !self.spec.is_empty() { - let ptr = self.spec.as_mut_ptr(); - let len = self.spec.len(); - mem::forget(mem::replace(&mut self.spec, Vec::new())); - ippsFree(ptr as *mut _); + // Zero out buffers + ippsZero_32f(self.fft_buffer.as_mut_ptr(), self.fft_buffer.len() as i32); + ippsZero_32fc( + self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.conv.len() as i32, + ); + ippsZero_32fc( + self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.pre_multiplied.len() as i32, + ); + ippsZero_32f(self.overlap.as_mut_ptr(), self.overlap.len() as i32); + } + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + unsafe { + ippsZero_32fc( + self.segments_ir[i].as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.segments_ir[i].len() as i32, + ); } + } + } - if !self.scratch_buffer.is_empty() { - let ptr = self.scratch_buffer.as_mut_ptr(); - let len = self.scratch_buffer.len(); - mem::forget(mem::replace(&mut self.scratch_buffer, Vec::new())); - ippsFree(ptr as *mut _); + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + if self.active_seg_count == 0 { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input to input buffer + let input_buffer_pos = self.input_buffer_fill; + unsafe { + ippsCopy_32f( + &input[processed], + &mut self.input_buffer[input_buffer_pos], + processing as i32, + ); + } + + // Forward FFT + copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + // Complex multiplication + if input_buffer_was_empty { + unsafe { + ippsZero_32fc( + self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.pre_multiplied.len() as i32, + ); + } + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + ); + } + } + + // Copy pre-multiplied to conv + unsafe { + ippsCopy_32fc( + self.pre_multiplied.as_ptr() as *const ipp_sys::Ipp32fc, + self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.conv.len() as i32, + ); + } + + complex_multiply_accumulate( + &mut 
self.conv, + &self.segments[self.current], + &self.segments_ir[0], + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + // Add overlap + sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + unsafe { + ippsZero_32f( + self.input_buffer.as_mut_ptr(), + self.input_buffer.len() as i32, + ); + } + self.input_buffer_fill = 0; + + // Save the overlap + unsafe { + ippsCopy_32f( + &self.fft_buffer[self.block_size], + self.overlap.as_mut_ptr(), + self.block_size as i32, + ); + } + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } + } + + #[derive(Clone)] + pub struct TwoStageFFTConvolver { + head_convolver: FFTConvolver, + tail_convolver0: FFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: FFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + } + + const HEAD_BLOCK_SIZE: usize = 128; + const TAIL_BLOCK_SIZE: usize = 1024; + + impl Convolution for TwoStageFFTConvolver { + fn init( + impulse_response: &[Sample], + _block_size: usize, + max_response_length: usize, + ) -> Self { + let head_block_size = HEAD_BLOCK_SIZE; + let tail_block_size = TAIL_BLOCK_SIZE; + + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = FFTConvolver::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = (max_response_length > tail_block_size) + .then(|| { + let tail_ir_len = + std::cmp::min(max_response_length - tail_block_size, tail_block_size); + FFTConvolver::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = (max_response_length > 2 * tail_block_size) + .then(|| { + let tail_ir_len = max_response_length - 2 * tail_block_size; + FFTConvolver::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + TwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + } + } + + fn update(&mut self, _response: &[Sample]) { + todo!() + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + // Head + self.head_convolver.process(input, 
output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + ); + + // Sum head and tail + let sum_begin = processed; + let sum_end = processed + processing; + + // Sum: 1st tail block + if !self.tail_precalculated0.is_empty() { + let mut precalculated_pos = self.precalculated_pos; + + // Use IPP to add the tail block to output + unsafe { + ippsAdd_32f( + &self.tail_precalculated0[precalculated_pos], + &output[sum_begin], + &mut output[sum_begin], + processing as i32, + ); + // ippsAddC_32f_I( + // std::slice::from_raw_parts( + // self.tail_precalculated0.as_ptr().add(precalculated_pos), + // processing, + // ) + // .as_ptr(), + // output[sum_begin..sum_end].as_mut_ptr(), + // processing as i32, + // ); + } + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + let precalculated_pos = self.precalculated_pos; + + // Use IPP to add the tail block to output + unsafe { + // ippsAddC_32f_I( + // std::slice::from_raw_parts( + // self.tail_precalculated.as_ptr().add(precalculated_pos), + // processing, + // ) + // .as_ptr(), + // output[sum_begin..sum_end].as_mut_ptr(), + // processing as i32, + // ); + ippsAdd_32f( + &self.tail_precalculated[precalculated_pos], + &output[sum_begin], + &mut output[sum_begin], + processing as i32, + ); + } } + + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + unsafe { + ippsCopy_32f( + &input[processed], + &mut self.tail_input[self.tail_input_fill], + processing as i32, + ); + } + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() + && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 + { + assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); + let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], + &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + ); + if self.tail_input_fill == TAIL_BLOCK_SIZE { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == TAIL_BLOCK_SIZE + && self.tail_output.len() == TAIL_BLOCK_SIZE + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == TAIL_BLOCK_SIZE { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; } } } } #[cfg(not(feature = "ipp"))] -pub use rust_fft::{complex_multiply_accumulate, complex_size, sum, FFTConvolver, Fft}; +pub use rust_fft::{ + complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, + TwoStageFFTConvolver, +}; #[cfg(feature = "ipp")] -pub use self::ipp_fft::FFT; +pub use self::ipp_fft::{ + complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, + TwoStageFFTConvolver, +}; From 034468e6b334a91a8112ae504bd596a98eb2142c Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 6 May 2025 12:11:38 +0200 Subject: [PATCH 03/11] Fix and add tests for validity --- src/fft_convolver.rs | 110 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 
9 deletions(-) diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs index f83b2d7..b766d1f 100644 --- a/src/fft_convolver.rs +++ b/src/fft_convolver.rs @@ -738,26 +738,30 @@ mod ipp_fft { } } - pub fn complex_multiply_accumulate(result: &mut [Complex32], a: &[Complex32], b: &[Complex32]) { + pub fn complex_multiply_accumulate( + result: &mut [Complex32], + a: &[Complex32], + b: &[Complex32], + temp_buffer: &mut [Complex32], + ) { assert_eq!(result.len(), a.len()); assert_eq!(result.len(), b.len()); + assert_eq!(temp_buffer.len(), a.len()); unsafe { - // IPP doesn't have a direct complex multiply-accumulate, so we do it in two steps - // First, multiply the complex arrays + // Use pre-allocated temp buffer instead of allocating let len = result.len(); - let mut temp = vec![Complex32::new(0.0, 0.0); len]; - ippsMulC_32fc( + // Use ippsMul_32fc instead of ippsMulC_32fc to correctly multiply arrays + ippsMul_32fc( a.as_ptr() as *const ipp_sys::Ipp32fc, - *(b.as_ptr() as *const ipp_sys::Ipp32fc), - temp.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + b.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, len as i32, ); - // Then add to the result ippsAdd_32fc( - temp.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, result.as_ptr() as *const ipp_sys::Ipp32fc, result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, len as i32, @@ -778,7 +782,19 @@ mod ipp_fft { ); } } + #[test] + fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } + } #[derive(Default, Clone)] pub struct FFTConvolver { ir_len: usize, @@ -797,6 +813,7 @@ mod ipp_fft { current: usize, input_buffer: Vec, input_buffer_fill: usize, + temp_buffer: Vec, } impl Convolution for FFTConvolver { @@ -858,6 +875,7 @@ mod ipp_fft { // reset current position let current = 0; + let temp_buffer = vec![Complex32::new(0.0, 0.0); fft_complex_size]; Self { ir_len, @@ -876,6 +894,7 @@ mod ipp_fft { current, input_buffer, input_buffer_fill, + temp_buffer, } } @@ -988,6 +1007,7 @@ mod ipp_fft { &mut self.pre_multiplied, &self.segments_ir[index_ir], &self.segments[index_audio], + &mut self.temp_buffer, ); } } @@ -1005,6 +1025,7 @@ mod ipp_fft { &mut self.conv, &self.segments[self.current], &self.segments_ir[0], + &mut self.temp_buffer, ); // Backward FFT @@ -1265,6 +1286,77 @@ mod ipp_fft { } } +#[cfg(all(test, feature = "ipp"))] +mod tests { + use super::*; + use crate::{fft_convolver::rust_fft, Convolution}; + + // Helper function that runs both implementations and compares results + fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { + let max_len = impulse_response.len(); + + // Create both implementations + let mut rust_convolver = + rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); + let mut ipp_convolver = ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); + + // Prepare output buffers + let mut rust_output = vec![0.0; input.len()]; + let mut ipp_output = vec![0.0; input.len()]; + + // Process with both implementations + rust_convolver.process(input, &mut rust_output); + ipp_convolver.process(input, &mut ipp_output); + + // Compare results (accounting for floating-point precision differences) + for i in 0..input.len() { + 
assert!( + (rust_output[i] - ipp_output[i]).abs() < 1e-5, + "Outputs differ at position {}: rust={}, ipp={}", + i, + rust_output[i], + ipp_output[i] + ); + } + } + + #[test] + fn test_ipp_vs_rust_impulse() { + // Test with an impulse response + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_decay() { + // Test with a decaying impulse response + let mut response = vec![0.0; 1024]; + for i in 0..response.len() { + response[i] = 0.9f32.powi(i as i32); + } + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_sine() { + // Test with a sine wave input + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + + let mut input = vec![0.0; 1024]; + for i in 0..input.len() { + input[i] = (i as f32 * 0.1).sin(); + } + + compare_implementations(&response, &input, 128); + } +} + #[cfg(not(feature = "ipp"))] pub use rust_fft::{ complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, From 2c9ef4a56848506bf65ddbabbdded14ca0c44d16 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 6 May 2025 14:55:10 +0200 Subject: [PATCH 04/11] Extract to separate files --- src/fft_convolver.rs | 1370 --------------------------------- src/fft_convolver/ipp_fft.rs | 661 ++++++++++++++++ src/fft_convolver/mod.rs | 86 +++ src/fft_convolver/rust_fft.rs | 493 ++++++++++++ src/tests.rs | 14 + 5 files changed, 1254 insertions(+), 1370 deletions(-) delete mode 100644 src/fft_convolver.rs create mode 100644 src/fft_convolver/ipp_fft.rs create mode 100644 src/fft_convolver/mod.rs create mode 100644 src/fft_convolver/rust_fft.rs diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs deleted file mode 100644 index b766d1f..0000000 --- a/src/fft_convolver.rs +++ /dev/null @@ -1,1370 +0,0 @@ -mod rust_fft { - use crate::{Convolution, Sample}; - use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; - use rustfft::num_complex::Complex; - use std::sync::Arc; - #[derive(Clone)] - pub struct Fft { - fft_forward: Arc>, - fft_inverse: Arc>, - } - - impl Default for Fft { - fn default() -> Self { - let mut planner = RealFftPlanner::::new(); - Self { - fft_forward: planner.plan_fft_forward(0), - fft_inverse: planner.plan_fft_inverse(0), - } - } - } - - impl std::fmt::Debug for Fft { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "") - } - } - - impl Fft { - pub fn init(&mut self, length: usize) { - let mut planner = RealFftPlanner::::new(); - self.fft_forward = planner.plan_fft_forward(length); - self.fft_inverse = planner.plan_fft_inverse(length); - } - - pub fn forward( - &self, - input: &mut [f32], - output: &mut [Complex], - ) -> Result<(), FftError> { - self.fft_forward.process(input, output)?; - Ok(()) - } - - pub fn inverse( - &self, - input: &mut [Complex], - output: &mut [f32], - ) -> Result<(), FftError> { - self.fft_inverse.process(input, output)?; - - // FFT Normalization - let len = output.len(); - output.iter_mut().for_each(|bin| *bin /= len as f32); - - Ok(()) - } - } - #[derive(Default, Clone)] - pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>>, - segments_ir: Vec>>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec>, - conv: Vec>, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, - } - 
- impl Convolution for FFTConvolver { - fn init( - impulse_response: &[Sample], - block_size: usize, - max_response_length: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; - let conv = vec![Complex::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - self.fft_buffer.fill(0.); - self.conv.fill(Complex::new(0., 0.)); - self.pre_multiplied.fill(Complex::new(0., 0.)); - self.overlap.fill(0.); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - self.segments_ir[i].fill(Complex::new(0., 0.)); - } - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - output.fill(0.); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - let input_buffer_pos = self.input_buffer_fill; - self.input_buffer[input_buffer_pos..input_buffer_pos + processing] - .clone_from_slice(&input[processed..processed + processing]); - - // Forward FFT - copy_and_pad(&mut 
self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_err) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - output.fill(0.); - return; // error! - } - - // complex multiplication - if input_buffer_was_empty { - self.pre_multiplied.fill(Complex { re: 0., im: 0. }); - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - ); - } - } - self.conv.clone_from_slice(&self.pre_multiplied); - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - ); - - // Backward FFT - if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - output.fill(0.); - return; // error! - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - self.input_buffer.fill(0.); - self.input_buffer_fill = 0; - // Save the overlap - self.overlap - .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } - } - pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 - } - - pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size); - dst[0..src_size].clone_from_slice(&src[0..src_size]); - dst[src_size..].iter_mut().for_each(|value| *value = 0.); - } - - pub fn complex_multiply_accumulate( - result: &mut [Complex], - a: &[Complex], - b: &[Complex], - ) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 4 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; - result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; - result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; - result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; - result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; - result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; - result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; - result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; - } - for i in end4..len { - result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; - result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; - } - } - - pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0] = a[i + 0] + b[i + 0]; - result[i + 1] = a[i + 1] + b[i + 1]; - result[i + 2] = a[i + 2] + b[i + 2]; - result[i + 3] = a[i + 3] + b[i + 3]; - } - for i in end4..len { - result[i] = a[i] + b[i]; - } - } - #[test] - fn test_fft_convolver_passthrough() { - let mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, 
response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); - } - } - #[derive(Clone)] - pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - } - - const HEAD_BLOCK_SIZE: usize = 128; - const TAIL_BLOCK_SIZE: usize = 1024; - - impl Convolution for TwoStageFFTConvolver { - fn init( - impulse_response: &[Sample], - _block_size: usize, - max_response_length: usize, - ) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - } - } - - fn update(&mut self, _response: &[Sample]) { - todo!() - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), - ); - - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; - - // Sum: 1st tail block - if self.tail_precalculated0.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated0[precalculated_pos]; - precalculated_pos += 1; - } - } - - // Sum: 2nd-Nth tail block - if self.tail_precalculated.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated[precalculated_pos]; - precalculated_pos += 1; - } - } - - 
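                // Two-stage scheme: the head convolver (HEAD_BLOCK_SIZE
                // partitions) covers the first TAIL_BLOCK_SIZE samples of the
                // IR with low latency, while the tail convolvers cover the
                // remainder. Tail output is computed one TAIL_BLOCK_SIZE block
                // ahead, swapped into `tail_precalculated0` /
                // `tail_precalculated`, and mixed in above at the running
                // offset `precalculated_pos`.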
self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] - .copy_from_slice(&input[processed..processed + processing]); - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 - { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], - ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if self.tail_precalculated.len() > 0 - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == TAIL_BLOCK_SIZE { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } - } -} - -#[cfg(feature = "ipp")] -mod ipp_fft { - use crate::{Convolution, Sample}; - use ipp_sys::*; - use num_complex::Complex32; - use std::mem; - use std::ptr; - use std::slice; - - pub struct Fft { - size: usize, - spec: Vec, - scratch_buffer: Vec, - } - - impl Default for Fft { - fn default() -> Self { - Self { - size: 0, - spec: Vec::new(), - scratch_buffer: Vec::new(), - } - } - } - - impl std::fmt::Debug for Fft { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "IppFft(size: {})", self.size) - } - } - - impl Clone for Fft { - fn clone(&self) -> Self { - let mut new_fft = Fft::default(); - if self.size > 0 { - new_fft.init(self.size); - } - new_fft - } - } - - impl Fft { - pub fn new(size: usize) -> Self { - let mut fft = Self::default(); - fft.init(size); - fft - } - - pub fn init(&mut self, size: usize) { - // Clean up old resources if any - self.clean_up(); - - if size == 0 { - return; - } - - assert!(size.is_power_of_two(), "FFT size must be a power of 2"); - - let mut spec_size = 0; - let mut init_size = 0; - let mut work_buf_size = 0; - - unsafe { - ippsDFTGetSize_R_32f( - size as i32, - 8, // IPP_FFT_NODIV_BY_ANY - 0, // No special hint - &mut spec_size, - &mut init_size, - &mut work_buf_size, - ); - } - - // Allocate aligned memory for buffers to get best SIMD performance - let mut init_buffer = allocate_aligned_buffer(init_size as usize); - let scratch_buffer = allocate_aligned_buffer(work_buf_size as usize); - let mut spec = allocate_aligned_buffer(spec_size as usize); - - unsafe { - ippsDFTInit_R_32f( - size as i32, - 8, // IPP_FFT_NODIV_BY_ANY - 0, // No special hint - spec.as_mut_ptr() as *mut ipp_sys::IppsDFTSpec_R_32f, - init_buffer.as_mut_ptr(), - ); - } - - // Free the init buffer as it's no longer needed - unsafe { - if !init_buffer.is_empty() { - let ptr = init_buffer.as_mut_ptr(); - mem::forget(mem::replace(&mut init_buffer, Vec::new())); - ippsFree(ptr as *mut _); - } - } - - self.size = size; - self.spec = spec; - self.scratch_buffer = scratch_buffer; - } - - pub fn forward( - &mut self, - input: &mut [f32], - output: &mut [Complex32], - ) -> Result<(), &'static str> { - if self.size == 0 { - return Err("FFT not initialized"); - } - - 
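            // ippsDFTFwd_RToCCS_32f writes its result in IPP's packed CCS
            // layout: size + 2 floats, i.e. size / 2 + 1 complex bins, which
            // is why the `Complex32` buffers are sized with `complex_size`.
            // With IPP_FFT_NODIV_BY_ANY (flag 8) neither direction scales the
            // data, so `inverse` divides by `size` explicitly.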
assert_eq!(input.len(), self.size, "Input length must match FFT size"); - assert_eq!( - output.len(), - self.size / 2 + 1, - "Output length must be size/2 + 1" - ); - - unsafe { - ippsDFTFwd_RToCCS_32f( - input.as_ptr(), - output.as_mut_ptr() as *mut _, - self.spec.as_ptr() as *const DFTSpec_R_32f, - self.scratch_buffer.as_mut_ptr(), - ); - } - - Ok(()) - } - - pub fn inverse( - &mut self, - input: &mut [Complex32], - output: &mut [f32], - ) -> Result<(), &'static str> { - if self.size == 0 { - return Err("FFT not initialized"); - } - - assert_eq!( - input.len(), - self.size / 2 + 1, - "Input length must be size/2 + 1" - ); - assert_eq!(output.len(), self.size, "Output length must match FFT size"); - - unsafe { - ippsDFTInv_CCSToR_32f( - input.as_ptr() as *const _, - output.as_mut_ptr(), - self.spec.as_ptr() as *const DFTSpec_R_32f, - self.scratch_buffer.as_mut_ptr(), - ); - } - - // Normalize the result - unsafe { - ippsDivC_32f_I(self.size as f32, output.as_mut_ptr(), self.size as i32); - } - - Ok(()) - } - - fn clean_up(&mut self) { - unsafe { - if !self.spec.is_empty() { - let ptr = self.spec.as_mut_ptr(); - mem::forget(mem::replace(&mut self.spec, Vec::new())); - ippsFree(ptr as *mut _); - } - - if !self.scratch_buffer.is_empty() { - let ptr = self.scratch_buffer.as_mut_ptr(); - mem::forget(mem::replace(&mut self.scratch_buffer, Vec::new())); - ippsFree(ptr as *mut _); - } - } - self.size = 0; - } - } - - // Drop trait implementation to free aligned memory - impl Drop for Fft { - fn drop(&mut self) { - self.clean_up(); - } - } - - // Helper function to allocate aligned memory using IPP functions - fn allocate_aligned_buffer(size: usize) -> Vec { - if size == 0 { - return Vec::new(); - } - - unsafe { - let ptr = ippsMalloc_8u(size as i32); - if ptr.is_null() { - panic!("Failed to allocate aligned memory"); - } - - // Construct a vector from the IPP-aligned pointer - Vec::from_raw_parts(ptr, size, size) - } - } - - // Helper functions for FFT convolution - pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 - } - - pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size, "Destination buffer too small"); - - // Copy the source data - unsafe { - ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); - - // Zero-pad the rest - if dst.len() > src_size { - ippsZero_32f( - dst.as_mut_ptr().add(src_size), - (dst.len() - src_size) as i32, - ); - } - } - } - - pub fn complex_multiply_accumulate( - result: &mut [Complex32], - a: &[Complex32], - b: &[Complex32], - temp_buffer: &mut [Complex32], - ) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - assert_eq!(temp_buffer.len(), a.len()); - - unsafe { - // Use pre-allocated temp buffer instead of allocating - let len = result.len(); - - // Use ippsMul_32fc instead of ippsMulC_32fc to correctly multiply arrays - ippsMul_32fc( - a.as_ptr() as *const ipp_sys::Ipp32fc, - b.as_ptr() as *const ipp_sys::Ipp32fc, - temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - len as i32, - ); - - ippsAdd_32fc( - temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, - result.as_ptr() as *const ipp_sys::Ipp32fc, - result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - len as i32, - ); - } - } - - pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - - unsafe { - ippsAdd_32f( - a.as_ptr(), - b.as_ptr(), - result.as_mut_ptr(), - result.len() as i32, - ); - } - } - #[test] - fn test_fft_convolver_passthrough() { - let 
mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); - } - } - #[derive(Default, Clone)] - pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - seg_size: usize, - seg_count: usize, - active_seg_count: usize, - fft_complex_size: usize, - segments: Vec>, - segments_ir: Vec>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec, - conv: Vec, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, - temp_buffer: Vec, - } - - impl Convolution for FFTConvolver { - fn init( - impulse_response: &[Sample], - block_size: usize, - max_response_length: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let mut segments = Vec::new(); - for _ in 0..seg_count { - segments.push(vec![Complex32::new(0., 0.); fft_complex_size]); - } - - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex32::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex32::new(0., 0.); fft_complex_size]; - let conv = vec![Complex32::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - let temp_buffer = vec![Complex32::new(0.0, 0.0); fft_complex_size]; - - Self { - ir_len, - block_size, - seg_size, - seg_count, - active_seg_count, - fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - temp_buffer, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - unsafe { - // Zero out buffers - ippsZero_32f(self.fft_buffer.as_mut_ptr(), self.fft_buffer.len() as i32); - ippsZero_32fc( - self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.conv.len() as i32, - ); - ippsZero_32fc( - self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.pre_multiplied.len() as i32, - ); - ippsZero_32f(self.overlap.as_mut_ptr(), self.overlap.len() as i32); - } - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let 
segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - unsafe { - ippsZero_32fc( - self.segments_ir[i].as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.segments_ir[i].len() as i32, - ); - } - } - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - // Copy input to input buffer - let input_buffer_pos = self.input_buffer_fill; - unsafe { - ippsCopy_32f( - &input[processed], - &mut self.input_buffer[input_buffer_pos], - processing as i32, - ); - } - - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - // Complex multiplication - if input_buffer_was_empty { - unsafe { - ippsZero_32fc( - self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.pre_multiplied.len() as i32, - ); - } - - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - &mut self.temp_buffer, - ); - } - } - - // Copy pre-multiplied to conv - unsafe { - ippsCopy_32fc( - self.pre_multiplied.as_ptr() as *const ipp_sys::Ipp32fc, - self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.conv.len() as i32, - ); - } - - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - &mut self.temp_buffer, - ); - - // Backward FFT - if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - unsafe { - ippsZero_32f( - self.input_buffer.as_mut_ptr(), - self.input_buffer.len() as i32, - ); - } - self.input_buffer_fill = 0; - - // Save the overlap - unsafe { - ippsCopy_32f( - &self.fft_buffer[self.block_size], - self.overlap.as_mut_ptr(), - self.block_size as i32, - ); - } - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } - } - - #[derive(Clone)] - pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, 
- tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - } - - const HEAD_BLOCK_SIZE: usize = 128; - const TAIL_BLOCK_SIZE: usize = 1024; - - impl Convolution for TwoStageFFTConvolver { - fn init( - impulse_response: &[Sample], - _block_size: usize, - max_response_length: usize, - ) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - } - } - - fn update(&mut self, _response: &[Sample]) { - todo!() - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), - ); - - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; - - // Sum: 1st tail block - if !self.tail_precalculated0.is_empty() { - let mut precalculated_pos = self.precalculated_pos; - - // Use IPP to add the tail block to output - unsafe { - ippsAdd_32f( - &self.tail_precalculated0[precalculated_pos], - &output[sum_begin], - &mut output[sum_begin], - processing as i32, - ); - // ippsAddC_32f_I( - // std::slice::from_raw_parts( - // self.tail_precalculated0.as_ptr().add(precalculated_pos), - // processing, - // ) - // .as_ptr(), - // output[sum_begin..sum_end].as_mut_ptr(), - // processing as i32, - // ); - } - } - - // Sum: 2nd-Nth tail block - if !self.tail_precalculated.is_empty() { - let precalculated_pos = self.precalculated_pos; - - // Use IPP to add the tail block to output - unsafe { - // ippsAddC_32f_I( - // std::slice::from_raw_parts( - // self.tail_precalculated.as_ptr().add(precalculated_pos), - // processing, - // ) - // .as_ptr(), - // 
output[sum_begin..sum_end].as_mut_ptr(), - // processing as i32, - // ); - ippsAdd_32f( - &self.tail_precalculated[precalculated_pos], - &output[sum_begin], - &mut output[sum_begin], - processing as i32, - ); - } - } - - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - unsafe { - ippsCopy_32f( - &input[processed], - &mut self.tail_input[self.tail_input_fill], - processing as i32, - ); - } - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if !self.tail_precalculated0.is_empty() - && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 - { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], - ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if !self.tail_precalculated.is_empty() - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == TAIL_BLOCK_SIZE { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } - } -} - -#[cfg(all(test, feature = "ipp"))] -mod tests { - use super::*; - use crate::{fft_convolver::rust_fft, Convolution}; - - // Helper function that runs both implementations and compares results - fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { - let max_len = impulse_response.len(); - - // Create both implementations - let mut rust_convolver = - rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); - let mut ipp_convolver = ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); - - // Prepare output buffers - let mut rust_output = vec![0.0; input.len()]; - let mut ipp_output = vec![0.0; input.len()]; - - // Process with both implementations - rust_convolver.process(input, &mut rust_output); - ipp_convolver.process(input, &mut ipp_output); - - // Compare results (accounting for floating-point precision differences) - for i in 0..input.len() { - assert!( - (rust_output[i] - ipp_output[i]).abs() < 1e-5, - "Outputs differ at position {}: rust={}, ipp={}", - i, - rust_output[i], - ipp_output[i] - ); - } - } - - #[test] - fn test_ipp_vs_rust_impulse() { - // Test with an impulse response - let mut response = vec![0.0; 1024]; - response[0] = 1.0; - let input = vec![1.0; 1024]; - - compare_implementations(&response, &input, 256); - } - - #[test] - fn test_ipp_vs_rust_decay() { - // Test with a decaying impulse response - let mut response = vec![0.0; 1024]; - for i in 0..response.len() { - response[i] = 0.9f32.powi(i as i32); - } - let input = vec![1.0; 1024]; - - compare_implementations(&response, &input, 256); - } - - #[test] - fn test_ipp_vs_rust_sine() { - // Test with a sine wave input - let mut response = vec![0.0; 1024]; - response[0] = 1.0; - - let mut input = vec![0.0; 1024]; - for i in 0..input.len() { - input[i] = (i as f32 * 0.1).sin(); - } - - compare_implementations(&response, &input, 128); - } -} - -#[cfg(not(feature = "ipp"))] -pub use rust_fft::{ - complex_multiply_accumulate, complex_size, copy_and_pad, 
sum, FFTConvolver, Fft, - TwoStageFFTConvolver, -}; - -#[cfg(feature = "ipp")] -pub use self::ipp_fft::{ - complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, - TwoStageFFTConvolver, -}; diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs new file mode 100644 index 0000000..c3ec1f6 --- /dev/null +++ b/src/fft_convolver/ipp_fft.rs @@ -0,0 +1,661 @@ +use crate::{Convolution, Sample}; +use ipp_sys::*; +use num_complex::Complex32; + +pub struct Fft { + size: usize, + spec: Vec, + scratch_buffer: Vec, +} + +impl Default for Fft { + fn default() -> Self { + Self { + size: 0, + spec: Vec::new(), + scratch_buffer: Vec::new(), + } + } +} + +impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "IppFft(size: {})", self.size) + } +} + +impl Clone for Fft { + fn clone(&self) -> Self { + let mut new_fft = Fft::default(); + if self.size > 0 { + new_fft.init(self.size); + } + new_fft + } +} + +impl Fft { + pub fn new(size: usize) -> Self { + let mut fft = Self::default(); + fft.init(size); + fft + } + + pub fn init(&mut self, size: usize) { + assert!(size > 0, "FFT size must be greater than 0"); + assert!(size.is_power_of_two(), "FFT size must be a power of 2"); + + let mut spec_size = 0; + let mut init_size = 0; + let mut work_buf_size = 0; + + unsafe { + ippsDFTGetSize_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + &mut spec_size, + &mut init_size, + &mut work_buf_size, + ); + } + + let mut init_buffer = vec![0; init_size as usize]; + let scratch_buffer = vec![0; work_buf_size as usize]; + let mut spec = vec![0; spec_size as usize]; + + unsafe { + ippsDFTInit_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + spec.as_mut_ptr() as *mut ipp_sys::IppsDFTSpec_R_32f, + init_buffer.as_mut_ptr(), + ); + } + + self.size = size; + self.spec = spec; + self.scratch_buffer = scratch_buffer; + } + + pub fn forward( + &mut self, + input: &mut [f32], + output: &mut [Complex32], + ) -> Result<(), &'static str> { + if self.size == 0 { + return Err("FFT not initialized"); + } + + assert_eq!(input.len(), self.size, "Input length must match FFT size"); + assert_eq!( + output.len(), + self.size / 2 + 1, + "Output length must be size/2 + 1" + ); + + unsafe { + ippsDFTFwd_RToCCS_32f( + input.as_ptr(), + output.as_mut_ptr() as *mut Ipp32f, // the API takes the pointer to the first float member of the complex array + self.spec.as_ptr() as *const DFTSpec_R_32f, + self.scratch_buffer.as_mut_ptr(), + ); + } + + Ok(()) + } + + pub fn inverse( + &mut self, + input: &mut [Complex32], + output: &mut [f32], + ) -> Result<(), &'static str> { + if self.size == 0 { + return Err("FFT not initialized"); + } + + assert_eq!( + input.len(), + self.size / 2 + 1, + "Input length must be size/2 + 1" + ); + assert_eq!(output.len(), self.size, "Output length must match FFT size"); + + unsafe { + ippsDFTInv_CCSToR_32f( + input.as_ptr() as *const Ipp32f, // the API takes the pointer to the first float member of the complex array + output.as_mut_ptr(), + self.spec.as_ptr() as *const DFTSpec_R_32f, + self.scratch_buffer.as_mut_ptr(), + ); + } + + // Normalization + unsafe { + ippsDivC_32f_I(self.size as f32, output.as_mut_ptr(), self.size as i32); + } + + Ok(()) + } +} + +pub fn complex_size(size: usize) -> usize { + (size / 2) + 1 +} + +pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size, "Destination buffer too small"); + + unsafe { + 
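        // ippsCopy_32f / ippsZero_32f mirror the safe copy + zero-fill used by
        // the rustfft backend. The assert above bounds the zero-pad of `dst`;
        // the copy itself relies on callers passing at least `src_size`
        // samples in `src`.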
ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); + + if dst.len() > src_size { + ippsZero_32f( + dst.as_mut_ptr().add(src_size), + (dst.len() - src_size) as i32, + ); + } + } +} + +pub fn complex_multiply_accumulate( + result: &mut [Complex32], + a: &[Complex32], + b: &[Complex32], + temp_buffer: &mut [Complex32], +) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + assert_eq!(temp_buffer.len(), a.len()); + + unsafe { + let len = result.len() as i32; + + ippsMul_32fc( + a.as_ptr() as *const ipp_sys::Ipp32fc, + b.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, + ); + + ippsAdd_32fc( + temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, + ); + } +} + +pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + + unsafe { + ippsAdd_32f( + a.as_ptr(), + b.as_ptr(), + result.as_mut_ptr(), + result.len() as i32, + ); + } +} + +#[derive(Default, Clone)] +pub struct FFTConvolver { + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: Fft, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + temp_buffer: Vec, +} + +impl Convolution for FFTConvolver { + fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = complex_size(seg_size); + + // FFT + let mut fft = Fft::default(); + fft.init(seg_size); + let mut fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::new(); + for _ in 0..seg_count { + segments.push(vec![Complex32::new(0., 0.); fft_complex_size]); + } + + let mut segments_ir = Vec::new(); + + // prepare ir + for i in 0..seg_count { + let mut segment = vec![Complex32::new(0., 0.); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut fft_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![Complex32::new(0., 0.); fft_complex_size]; + let conv = vec![Complex32::new(0., 0.); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + let temp_buffer = vec![Complex32::new(0.0, 0.0); fft_complex_size]; + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + 
temp_buffer, + } + } + + fn update(&mut self, response: &[Sample]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + unsafe { + // Zero out buffers + ippsZero_32f(self.fft_buffer.as_mut_ptr(), self.fft_buffer.len() as i32); + ippsZero_32fc( + self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.conv.len() as i32, + ); + ippsZero_32fc( + self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.pre_multiplied.len() as i32, + ); + ippsZero_32f(self.overlap.as_mut_ptr(), self.overlap.len() as i32); + } + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + unsafe { + ippsZero_32fc( + self.segments_ir[i].as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.segments_ir[i].len() as i32, + ); + } + } + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + if self.active_seg_count == 0 { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input to input buffer + let input_buffer_pos = self.input_buffer_fill; + unsafe { + ippsCopy_32f( + &input[processed], + &mut self.input_buffer[input_buffer_pos], + processing as i32, + ); + } + + // Forward FFT + copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + // Complex multiplication + if input_buffer_was_empty { + unsafe { + ippsZero_32fc( + self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.pre_multiplied.len() as i32, + ); + } + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + &mut self.temp_buffer, + ); + } + } + + // Copy pre-multiplied to conv + unsafe { + ippsCopy_32fc( + self.pre_multiplied.as_ptr() as *const ipp_sys::Ipp32fc, + self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.conv.len() as i32, + ); + } + + complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + &mut self.temp_buffer, + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + // Add overlap + sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + 
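            // Once a full block has been gathered, the second half of the
            // inverse FFT is saved as the next block's overlap and `current`
            // steps backwards through the spectrum ring (the frequency-domain
            // delay line), exactly as in the rustfft backend.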
self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + unsafe { + ippsZero_32f( + self.input_buffer.as_mut_ptr(), + self.input_buffer.len() as i32, + ); + } + self.input_buffer_fill = 0; + + // Save the overlap + unsafe { + ippsCopy_32f( + &self.fft_buffer[self.block_size], + self.overlap.as_mut_ptr(), + self.block_size as i32, + ); + } + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} + +#[derive(Clone)] +pub struct TwoStageFFTConvolver { + head_convolver: FFTConvolver, + tail_convolver0: FFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: FFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, +} + +const HEAD_BLOCK_SIZE: usize = 128; +const TAIL_BLOCK_SIZE: usize = 1024; + +impl Convolution for TwoStageFFTConvolver { + fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { + let head_block_size = HEAD_BLOCK_SIZE; + let tail_block_size = TAIL_BLOCK_SIZE; + + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = FFTConvolver::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = (max_response_length > tail_block_size) + .then(|| { + let tail_ir_len = + std::cmp::min(max_response_length - tail_block_size, tail_block_size); + FFTConvolver::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = (max_response_length > 2 * tail_block_size) + .then(|| { + let tail_ir_len = max_response_length - 2 * tail_block_size; + FFTConvolver::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + TwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + } + } + + fn update(&mut self, _response: &[Sample]) { + todo!() + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + ); + + // Sum head and tail + let sum_begin = processed; + + // Sum: 1st tail block + if !self.tail_precalculated0.is_empty() { + // Use IPP to add the tail block to output + unsafe { + ippsAdd_32f( + 
&self.tail_precalculated0[self.precalculated_pos], + &output[sum_begin], + &mut output[sum_begin], + processing as i32, + ); + } + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + // Use IPP to add the tail block to output + unsafe { + ippsAdd_32f( + &self.tail_precalculated[self.precalculated_pos], + &output[sum_begin], + &mut output[sum_begin], + processing as i32, + ); + } + } + + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + unsafe { + ippsCopy_32f( + &input[processed], + &mut self.tail_input[self.tail_input_fill], + processing as i32, + ); + } + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { + assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); + let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], + &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + ); + if self.tail_input_fill == TAIL_BLOCK_SIZE { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == TAIL_BLOCK_SIZE + && self.tail_output.len() == TAIL_BLOCK_SIZE + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == TAIL_BLOCK_SIZE { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs new file mode 100644 index 0000000..6b507e7 --- /dev/null +++ b/src/fft_convolver/mod.rs @@ -0,0 +1,86 @@ +#[cfg(feature = "ipp")] +pub mod ipp_fft; +pub mod rust_fft; + +#[cfg(not(feature = "ipp"))] +pub use rust_fft::{ + complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, + TwoStageFFTConvolver, +}; + +#[cfg(feature = "ipp")] +pub use self::ipp_fft::{ + complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, + TwoStageFFTConvolver, +}; + +#[cfg(all(test, feature = "ipp"))] +mod tests { + use super::*; + use crate::{fft_convolver::rust_fft, Convolution}; + + // Helper function that runs both implementations and compares results + fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { + let max_len = impulse_response.len(); + + // Create both implementations + let mut rust_convolver = + rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); + let mut ipp_convolver = ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); + + // Prepare output buffers + let mut rust_output = vec![0.0; input.len()]; + let mut ipp_output = vec![0.0; input.len()]; + + // Process with both implementations + rust_convolver.process(input, &mut rust_output); + ipp_convolver.process(input, &mut ipp_output); + + // Compare results (accounting for floating-point precision differences) + for i in 0..input.len() { + assert!( + (rust_output[i] - ipp_output[i]).abs() < 1e-5, + "Outputs differ at position {}: rust={}, ipp={}", + i, + rust_output[i], + ipp_output[i] + ); + } + } + + #[test] + fn test_ipp_vs_rust_impulse() { + // Test with an impulse response + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + let input = vec![1.0; 1024]; + + 
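        // With a unit impulse the convolution is the identity, so both
        // backends should reproduce `input`. A direct-form reference
        // (hypothetical helper, shown only to make the expectation explicit):
        fn direct_conv(h: &[f32], x: &[f32]) -> Vec<f32> {
            let mut y = vec![0.0f32; x.len()];
            for n in 0..x.len() {
                // y[n] = sum over k of h[k] * x[n - k]
                for k in 0..=n.min(h.len() - 1) {
                    y[n] += h[k] * x[n - k];
                }
            }
            y
        }
        let expected = direct_conv(&response, &input);
        assert!(expected
            .iter()
            .zip(&input)
            .all(|(e, x)| (e - x).abs() < 1e-6));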
compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_decay() { + // Test with a decaying impulse response + let mut response = vec![0.0; 1024]; + for i in 0..response.len() { + response[i] = 0.9f32.powi(i as i32); + } + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_sine() { + // Test with a sine wave input + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + + let mut input = vec![0.0; 1024]; + for i in 0..input.len() { + input[i] = (i as f32 * 0.1).sin(); + } + + compare_implementations(&response, &input, 128); + } +} diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs new file mode 100644 index 0000000..fbc7935 --- /dev/null +++ b/src/fft_convolver/rust_fft.rs @@ -0,0 +1,493 @@ +use crate::{Convolution, Sample}; +use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; +use rustfft::num_complex::Complex; +use std::sync::Arc; +#[derive(Clone)] +pub struct Fft { + fft_forward: Arc>, + fft_inverse: Arc>, +} + +impl Default for Fft { + fn default() -> Self { + let mut planner = RealFftPlanner::::new(); + Self { + fft_forward: planner.plan_fft_forward(0), + fft_inverse: planner.plan_fft_inverse(0), + } + } +} + +impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "") + } +} + +impl Fft { + pub fn init(&mut self, length: usize) { + let mut planner = RealFftPlanner::::new(); + self.fft_forward = planner.plan_fft_forward(length); + self.fft_inverse = planner.plan_fft_inverse(length); + } + + pub fn forward(&self, input: &mut [f32], output: &mut [Complex]) -> Result<(), FftError> { + self.fft_forward.process(input, output)?; + Ok(()) + } + + pub fn inverse(&self, input: &mut [Complex], output: &mut [f32]) -> Result<(), FftError> { + self.fft_inverse.process(input, output)?; + + // FFT Normalization + let len = output.len(); + output.iter_mut().for_each(|bin| *bin /= len as f32); + + Ok(()) + } +} +#[derive(Default, Clone)] +pub struct FFTConvolver { + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>>, + segments_ir: Vec>>, + fft_buffer: Vec, + fft: Fft, + pre_multiplied: Vec>, + conv: Vec>, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, +} + +impl Convolution for FFTConvolver { + fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = complex_size(seg_size); + + // FFT + let mut fft = Fft::default(); + fft.init(seg_size); + let mut fft_buffer = vec![0.; seg_size]; + + // prepare segments + let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; + let mut segments_ir = Vec::new(); + + // prepare ir + for i in 0..seg_count { + let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + 
remaining + }; + copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut fft_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; + let conv = vec![Complex::new(0., 0.); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + } + } + + fn update(&mut self, response: &[Sample]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + self.fft_buffer.fill(0.); + self.conv.fill(Complex::new(0., 0.)); + self.pre_multiplied.fill(Complex::new(0., 0.)); + self.overlap.fill(0.); + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + self.segments_ir[i].fill(Complex::new(0., 0.)); + } + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + if self.active_seg_count == 0 { + output.fill(0.); + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + let input_buffer_pos = self.input_buffer_fill; + self.input_buffer[input_buffer_pos..input_buffer_pos + processing] + .clone_from_slice(&input[processed..processed + processing]); + + // Forward FFT + copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_err) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + output.fill(0.); + return; // error! + } + + // complex multiplication + if input_buffer_was_empty { + self.pre_multiplied.fill(Complex { re: 0., im: 0. }); + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + ); + } + } + self.conv.clone_from_slice(&self.pre_multiplied); + complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + ); + + // Backward FFT + if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + output.fill(0.); + return; // error! 
+ } + + // Add overlap + sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + self.input_buffer.fill(0.); + self.input_buffer_fill = 0; + // Save the overlap + self.overlap + .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} +pub fn complex_size(size: usize) -> usize { + (size / 2) + 1 +} + +pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size); + dst[0..src_size].clone_from_slice(&src[0..src_size]); + dst[src_size..].iter_mut().for_each(|value| *value = 0.); +} + +pub fn complex_multiply_accumulate( + result: &mut [Complex], + a: &[Complex], + b: &[Complex], +) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 4 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; + result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; + result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; + result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; + result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; + result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; + result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; + result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + } + for i in end4..len { + result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; + result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + } +} + +pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0] = a[i + 0] + b[i + 0]; + result[i + 1] = a[i + 1] + b[i + 1]; + result[i + 2] = a[i + 2] + b[i + 2]; + result[i + 3] = a[i + 3] + b[i + 3]; + } + for i in end4..len { + result[i] = a[i] + b[i]; + } +} +#[test] +fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } +} +#[derive(Clone)] +pub struct TwoStageFFTConvolver { + head_convolver: FFTConvolver, + tail_convolver0: FFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: FFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, +} + +const HEAD_BLOCK_SIZE: usize = 128; +const TAIL_BLOCK_SIZE: usize = 1024; + +impl Convolution for TwoStageFFTConvolver { + fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { + let head_block_size = HEAD_BLOCK_SIZE; + let tail_block_size = TAIL_BLOCK_SIZE; + + if max_response_length < impulse_response.len() { + panic!( + 
"max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = FFTConvolver::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = (max_response_length > tail_block_size) + .then(|| { + let tail_ir_len = + std::cmp::min(max_response_length - tail_block_size, tail_block_size); + FFTConvolver::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = (max_response_length > 2 * tail_block_size) + .then(|| { + let tail_ir_len = max_response_length - 2 * tail_block_size; + FFTConvolver::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + TwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + } + } + + fn update(&mut self, _response: &[Sample]) { + todo!() + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + ); + + // Sum head and tail + let sum_begin = processed; + let sum_end = processed + processing; + + // Sum: 1st tail block + if self.tail_precalculated0.len() > 0 { + let mut precalculated_pos = self.precalculated_pos; + for i in sum_begin..sum_end { + output[i] += self.tail_precalculated0[precalculated_pos]; + precalculated_pos += 1; + } + } + + // Sum: 2nd-Nth tail block + if self.tail_precalculated.len() > 0 { + let mut precalculated_pos = self.precalculated_pos; + for i in sum_begin..sum_end { + output[i] += self.tail_precalculated[precalculated_pos]; + precalculated_pos += 1; + } + } + + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] + .copy_from_slice(&input[processed..processed + processing]); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { + assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); + let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], + &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + ); + if self.tail_input_fill == TAIL_BLOCK_SIZE { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if self.tail_precalculated.len() > 0 + 
&& self.tail_input_fill == TAIL_BLOCK_SIZE + && self.tail_output.len() == TAIL_BLOCK_SIZE + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == TAIL_BLOCK_SIZE { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} diff --git a/src/tests.rs b/src/tests.rs index 160aac4..7b2be9a 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -4,6 +4,20 @@ mod tests { use crate::fft_convolver::FFTConvolver; use crate::{Convolution, Sample}; + #[test] + fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } + } + fn generate_sinusoid( length: usize, frequency: f32, From b68c55faef30fe8fff791077f45b010cf153d273 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Wed, 7 May 2025 10:02:46 +0200 Subject: [PATCH 05/11] Add ipp information to readme --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2064f04..7594ba6 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ As the original C++ implementation, this library implements: - Partitioned convolution algorithm (using uniform block sizes) - Optional support for non-uniform block sizes (`TwoStageFFTConvolver`) +- An optimised version of the convolvers using Intel IPP (if the `ipp` feature is enabled) On top of that it implements: @@ -16,9 +17,19 @@ On top of that it implements: Compared to the original C++ implementation, this implementation does _not_ provide: -- Its own FFT implementation (it currently uses the rustfft crate) +- Its own FFT implementation (it currently uses the rustfft crate by default or the IPP library if the `ipp` feature is enabled) - The option to use SSE instructions in the `FFTConvolver` ## Prerequisites: - rust >=1.72.0 + +#### Optional dependencies: + +- Intel IPP (if the `ipp` feature is enabled) + +When building with the `ipp` feature, the `ipp-sys` crate is used to link against the IPP library. To build with the `ipp` feature (`cargo build --features ipp`), you need to have the IPP library installed and the `IPPROOT` environment variable set to the root directory of the IPP installation. On Linux this is achieved by running: + +- `source /opt/intel/oneapi/setvars.sh` + +If you want `ipp` to be statically linked, export `IPP_STATIC=1` before running `cargo build`. 
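
For example, a build with statically linked IPP could look like this (assuming a default oneAPI installation; adjust the path to your setup):

- `source /opt/intel/oneapi/setvars.sh`
- `IPP_STATIC=1 cargo build --features ipp`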
\ No newline at end of file From 39ba1770b49e94b60a4449e93725446f0315c9cb Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Fri, 23 May 2025 17:26:02 +0200 Subject: [PATCH 06/11] Expose head and tail block size for two stage convolvers --- src/fft_convolver/ipp_fft.rs | 47 ++++++++++++++++++++--------------- src/fft_convolver/rust_fft.rs | 47 ++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 40 deletions(-) diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs index c3ec1f6..327d487 100644 --- a/src/fft_convolver/ipp_fft.rs +++ b/src/fft_convolver/ipp_fft.rs @@ -492,16 +492,18 @@ pub struct TwoStageFFTConvolver { tail_input: Vec, tail_input_fill: usize, precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, } -const HEAD_BLOCK_SIZE: usize = 128; -const TAIL_BLOCK_SIZE: usize = 1024; - -impl Convolution for TwoStageFFTConvolver { - fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - +impl TwoStageFFTConvolver { + pub fn init( + impulse_response: &[Sample], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { if max_response_length < impulse_response.len() { panic!( "max_response_length must be at least the length of the initial impulse response" @@ -560,14 +562,19 @@ impl Convolution for TwoStageFFTConvolver { tail_input, tail_input_fill, precalculated_pos, + head_block_size, + tail_block_size, } } - fn update(&mut self, _response: &[Sample]) { + pub fn update(&mut self, _response: &[Sample]) { todo!() } - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + pub fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + // Head self.head_convolver.process(input, output); @@ -583,7 +590,7 @@ impl Convolution for TwoStageFFTConvolver { let remaining = len - processed; let processing = std::cmp::min( remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + head_block_size - (self.tail_input_fill % head_block_size), ); // Sum head and tail @@ -628,29 +635,29 @@ impl Convolution for TwoStageFFTConvolver { self.tail_input_fill += processing; // Convolution: 1st tail block - if !self.tail_precalculated0.is_empty() && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { + if self.tail_input_fill == tail_block_size { std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); } } // Convolution: 2nd-Nth tail block (might be done in some background thread) if !self.tail_precalculated.is_empty() - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE + && self.tail_input_fill == tail_block_size + && self.tail_output.len() 
== tail_block_size { std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); self.tail_convolver .process(&self.tail_input, &mut self.tail_output); } - if self.tail_input_fill == TAIL_BLOCK_SIZE { + if self.tail_input_fill == tail_block_size { self.tail_input_fill = 0; self.precalculated_pos = 0; } diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs index fbc7935..870c292 100644 --- a/src/fft_convolver/rust_fft.rs +++ b/src/fft_convolver/rust_fft.rs @@ -336,16 +336,18 @@ pub struct TwoStageFFTConvolver { tail_input: Vec, tail_input_fill: usize, precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, } -const HEAD_BLOCK_SIZE: usize = 128; -const TAIL_BLOCK_SIZE: usize = 1024; - -impl Convolution for TwoStageFFTConvolver { - fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - +impl TwoStageFFTConvolver { + pub fn init( + impulse_response: &[Sample], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { if max_response_length < impulse_response.len() { panic!( "max_response_length must be at least the length of the initial impulse response" @@ -404,14 +406,19 @@ impl Convolution for TwoStageFFTConvolver { tail_input, tail_input_fill, precalculated_pos, + head_block_size, + tail_block_size, } } - fn update(&mut self, _response: &[Sample]) { + pub fn update(&mut self, _response: &[Sample]) { todo!() } - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + pub fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + // Head self.head_convolver.process(input, output); @@ -427,7 +434,7 @@ impl Convolution for TwoStageFFTConvolver { let remaining = len - processed; let processing = std::cmp::min( remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + head_block_size - (self.tail_input_fill % head_block_size), ); // Sum head and tail @@ -460,29 +467,29 @@ impl Convolution for TwoStageFFTConvolver { self.tail_input_fill += processing; // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + if self.tail_precalculated0.len() > 0 && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { + if self.tail_input_fill == tail_block_size { std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); } } // Convolution: 2nd-Nth tail block (might be done in some background thread) if self.tail_precalculated.len() > 0 - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE + && self.tail_input_fill == tail_block_size + && self.tail_output.len() == tail_block_size { std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); self.tail_convolver 
.process(&self.tail_input, &mut self.tail_output); } - if self.tail_input_fill == TAIL_BLOCK_SIZE { + if self.tail_input_fill == tail_block_size { self.tail_input_fill = 0; self.precalculated_pos = 0; } From 726963833915daf5694772edbcc4166c7aee6702 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Mon, 26 May 2025 17:37:14 +0200 Subject: [PATCH 07/11] Unify ipp and rustfft implementations with traits --- src/fft_convolver/ipp_fft.rs | 497 +-------------------------- src/fft_convolver/mod.rs | 609 +++++++++++++++++++++++++++++++++- src/fft_convolver/rust_fft.rs | 430 +++--------------------- src/tests.rs | 34 +- 4 files changed, 705 insertions(+), 865 deletions(-) diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs index 327d487..7af4bbf 100644 --- a/src/fft_convolver/ipp_fft.rs +++ b/src/fft_convolver/ipp_fft.rs @@ -1,4 +1,4 @@ -use crate::{Convolution, Sample}; +use crate::{fft_convolver::FftBackend, Convolution, Sample}; use ipp_sys::*; use num_complex::Complex32; @@ -34,14 +34,10 @@ impl Clone for Fft { } } -impl Fft { - pub fn new(size: usize) -> Self { - let mut fft = Self::default(); - fft.init(size); - fft - } +impl FftBackend for Fft { + type Complex = Complex32; - pub fn init(&mut self, size: usize) { + fn init(&mut self, size: usize) { assert!(size > 0, "FFT size must be greater than 0"); assert!(size.is_power_of_two(), "FFT size must be a power of 2"); @@ -79,13 +75,13 @@ impl Fft { self.scratch_buffer = scratch_buffer; } - pub fn forward( + fn forward( &mut self, input: &mut [f32], - output: &mut [Complex32], - ) -> Result<(), &'static str> { + output: &mut [Self::Complex], + ) -> Result<(), Box> { if self.size == 0 { - return Err("FFT not initialized"); + return Err("FFT not initialized".into()); } assert_eq!(input.len(), self.size, "Input length must match FFT size"); @@ -107,13 +103,13 @@ impl Fft { Ok(()) } - pub fn inverse( + fn inverse( &mut self, - input: &mut [Complex32], + input: &mut [Self::Complex], output: &mut [f32], - ) -> Result<(), &'static str> { + ) -> Result<(), Box> { if self.size == 0 { - return Err("FFT not initialized"); + return Err("FFT not initialized".into()); } assert_eq!( @@ -203,466 +199,9 @@ pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { } } -#[derive(Default, Clone)] -pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>, - segments_ir: Vec>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec, - conv: Vec, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, - temp_buffer: Vec, -} - -impl Convolution for FFTConvolver { - fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let mut segments = Vec::new(); - for _ in 0..seg_count { - segments.push(vec![Complex32::new(0., 0.); 
fft_complex_size]); - } - - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex32::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex32::new(0., 0.); fft_complex_size]; - let conv = vec![Complex32::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - let temp_buffer = vec![Complex32::new(0.0, 0.0); fft_complex_size]; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - temp_buffer, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - unsafe { - // Zero out buffers - ippsZero_32f(self.fft_buffer.as_mut_ptr(), self.fft_buffer.len() as i32); - ippsZero_32fc( - self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.conv.len() as i32, - ); - ippsZero_32fc( - self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.pre_multiplied.len() as i32, - ); - ippsZero_32f(self.overlap.as_mut_ptr(), self.overlap.len() as i32); - } - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - unsafe { - ippsZero_32fc( - self.segments_ir[i].as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.segments_ir[i].len() as i32, - ); - } - } - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - // Copy input to input buffer - let input_buffer_pos = self.input_buffer_fill; - unsafe { - ippsCopy_32f( - &input[processed], - &mut self.input_buffer[input_buffer_pos], - processing as i32, - ); - } - - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - // Complex multiplication - if input_buffer_was_empty { - unsafe { - ippsZero_32fc( - 
self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.pre_multiplied.len() as i32, - ); - } - - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - &mut self.temp_buffer, - ); - } - } - - // Copy pre-multiplied to conv - unsafe { - ippsCopy_32fc( - self.pre_multiplied.as_ptr() as *const ipp_sys::Ipp32fc, - self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.conv.len() as i32, - ); - } - - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - &mut self.temp_buffer, - ); - - // Backward FFT - if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - unsafe { - ippsZero_32f( - self.input_buffer.as_mut_ptr(), - self.input_buffer.len() as i32, - ); - } - self.input_buffer_fill = 0; - - // Save the overlap - unsafe { - ippsCopy_32f( - &self.fft_buffer[self.block_size], - self.overlap.as_mut_ptr(), - self.block_size as i32, - ); - } - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } -} - -#[derive(Clone)] -pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - head_block_size: usize, - tail_block_size: usize, -} - -impl TwoStageFFTConvolver { - pub fn init( - impulse_response: &[Sample], - _block_size: usize, - max_response_length: usize, - head_block_size: usize, - tail_block_size: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = 
vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - head_block_size, - tail_block_size, - } - } - - pub fn update(&mut self, _response: &[Sample]) { - todo!() - } - - pub fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - let head_block_size = self.head_block_size; - let tail_block_size = self.tail_block_size; - - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - head_block_size - (self.tail_input_fill % head_block_size), - ); - - // Sum head and tail - let sum_begin = processed; - - // Sum: 1st tail block - if !self.tail_precalculated0.is_empty() { - // Use IPP to add the tail block to output - unsafe { - ippsAdd_32f( - &self.tail_precalculated0[self.precalculated_pos], - &output[sum_begin], - &mut output[sum_begin], - processing as i32, - ); - } - } - - // Sum: 2nd-Nth tail block - if !self.tail_precalculated.is_empty() { - // Use IPP to add the tail block to output - unsafe { - ippsAdd_32f( - &self.tail_precalculated[self.precalculated_pos], - &output[sum_begin], - &mut output[sum_begin], - processing as i32, - ); - } - } - - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - unsafe { - ippsCopy_32f( - &input[processed], - &mut self.tail_input[self.tail_input_fill], - processing as i32, - ); - } - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { - assert!(self.tail_input_fill >= head_block_size); - let block_offset = self.tail_input_fill - head_block_size; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + head_block_size], - &mut self.tail_output0[block_offset..block_offset + head_block_size], - ); - if self.tail_input_fill == tail_block_size { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if !self.tail_precalculated.is_empty() - && self.tail_input_fill == tail_block_size - && self.tail_output.len() == tail_block_size - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == tail_block_size { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } -} +pub type FFTConvolver = + crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = crate::fft_convolver::GenericTwoStageFFTConvolver< + Fft, + crate::fft_convolver::ipp_ops::IppComplexOps, +>; diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs index 6b507e7..bfc6450 100644 --- a/src/fft_convolver/mod.rs +++ b/src/fft_convolver/mod.rs @@ -1,23 +1,628 @@ +use crate::Convolution; #[cfg(feature = "ipp")] pub mod ipp_fft; pub mod rust_fft; +// Define a trait that will be implemented by both FFT backends +pub trait FftBackend: Clone { + type Complex; + + fn init(&mut self, 
size: usize); + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box>; + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box>; +} + +// Define trait for complex operations that will be used by both backends +pub trait ComplexOps: Clone { + type Complex: Clone + Default; + + fn complex_size(size: usize) -> usize; + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize); + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ); + fn sum(result: &mut [f32], a: &[f32], b: &[f32]); + fn zero_complex(buffer: &mut [Self::Complex]); + fn zero_real(buffer: &mut [f32]); + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]); + fn add_to_buffer(dst: &mut [f32], src: &[f32]); +} + +// Generic FFTConvolver implementation that works with any backend +#[derive(Default, Clone)] +pub struct GenericFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: F, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + _complex_ops: std::marker::PhantomData, + temp_buffer: Option>, +} + +impl> GenericFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init_internal( + impulse_response: &[f32], + block_size: usize, + max_response_length: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = C::complex_size(seg_size); + + // FFT + let mut fft = F::default(); + fft.init(seg_size); + let fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::with_capacity(seg_count); + for _ in 0..seg_count { + segments.push(vec![F::Complex::default(); fft_complex_size]); + } + + let mut segments_ir = Vec::with_capacity(seg_count); + + // prepare ir + let mut ir_buffer = vec![0.0; seg_size]; + for i in 0..seg_count { + let mut segment = vec![F::Complex::default(); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + C::copy_and_pad(&mut ir_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut ir_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![F::Complex::default(); fft_complex_size]; + let conv = vec![F::Complex::default(); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + // For IPP backend we need a temp buffer + let temp_buffer = Some(vec![F::Complex::default(); fft_complex_size]); + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + 
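            // `seg_count` is fixed by `max_response_length`, while the
            // `active_seg_count` field that follows tracks how many IR segments
            // are currently in use; it may shrink when `update()` is called
            // with a shorter impulse response.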
active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + _complex_ops: std::marker::PhantomData, + temp_buffer, + } + } +} + +impl> Convolution for GenericFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init(impulse_response: &[f32], block_size: usize, max_response_length: usize) -> Self { + Self::init_internal(impulse_response, block_size, max_response_length) + } + + fn update(&mut self, response: &[f32]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + C::zero_real(&mut self.fft_buffer); + C::zero_complex(&mut self.conv); + C::zero_complex(&mut self.pre_multiplied); + C::zero_real(&mut self.overlap); + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + C::copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + C::zero_complex(&mut self.segments_ir[i]); + } + } + + fn process(&mut self, input: &[f32], output: &mut [f32]) { + if self.active_seg_count == 0 { + C::zero_real(output); + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input to input buffer + let input_buffer_pos = self.input_buffer_fill; + C::copy_and_pad( + &mut self.input_buffer[input_buffer_pos..], + &input[processed..processed + processing], + processing, + ); + + // Forward FFT + C::copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + C::zero_real(output); + return; + } + + // complex multiplication + if input_buffer_was_empty { + C::zero_complex(&mut self.pre_multiplied); + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + C::complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + } + } + + C::copy_complex(&mut self.conv, &self.pre_multiplied); + + C::complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + C::zero_real(output); + return; + } + + // Add overlap + C::sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + 
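                    // `C::zero_real` dispatches to the selected backend: a plain
                    // `fill(0.0)` for the rustfft ops, `ippsZero_32f` for the IPP ops.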
C::zero_real(&mut self.input_buffer); + self.input_buffer_fill = 0; + + // Save the overlap + C::copy_and_pad( + &mut self.overlap, + &self.fft_buffer[self.block_size..], + self.block_size, + ); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} + +// Generic TwoStageFFTConvolver implementation +#[derive(Clone)] +pub struct GenericTwoStageFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + head_convolver: GenericFFTConvolver, + tail_convolver0: GenericFFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: GenericFFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, +} + +impl> GenericTwoStageFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + pub fn init( + impulse_response: &[f32], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = GenericFFTConvolver::::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = if max_response_length > tail_block_size { + let tail_ir_len = std::cmp::min(max_response_length - tail_block_size, tail_block_size); + GenericFFTConvolver::::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + } else { + GenericFFTConvolver::::default() + }; + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = if max_response_length > 2 * tail_block_size { + let tail_ir_len = max_response_length - 2 * tail_block_size; + GenericFFTConvolver::::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + } else { + GenericFFTConvolver::::default() + }; + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + GenericTwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + head_block_size, + tail_block_size, + } + } + + pub fn update(&mut self, _response: &[f32]) { + todo!() + } + + pub fn process(&mut self, input: &[f32], output: &mut [f32]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + head_block_size - (self.tail_input_fill % head_block_size), + ); + + // Sum head and tail + let sum_begin = processed; + + // Sum: 1st tail block + if 
!self.tail_precalculated0.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated0 + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + C::copy_and_pad( + &mut self.tail_input[self.tail_input_fill..], + &input[processed..processed + processing], + processing, + ); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], + ); + if self.tail_input_fill == tail_block_size { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == tail_block_size + && self.tail_output.len() == tail_block_size + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == tail_block_size { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} + +// Type definitions for IPP and RustFFT implementations #[cfg(not(feature = "ipp"))] pub use rust_fft::{ complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, TwoStageFFTConvolver, }; +pub mod rust_ops { + use super::*; + use rustfft::num_complex::Complex; + + #[derive(Clone, Default)] + pub struct RustComplexOps; + + impl ComplexOps for RustComplexOps { + type Complex = Complex; + + fn complex_size(size: usize) -> usize { + rust_fft::complex_size(size) + } + + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + rust_fft::copy_and_pad(dst, src, src_size) + } + + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + _temp_buffer: Option<&mut [Self::Complex]>, + ) { + rust_fft::complex_multiply_accumulate(result, a, b) + } + + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + rust_fft::sum(result, a, b) + } + + fn zero_complex(buffer: &mut [Self::Complex]) { + buffer.fill(Complex { re: 0.0, im: 0.0 }); + } + + fn zero_real(buffer: &mut [f32]) { + buffer.fill(0.0); + } + + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + dst.clone_from_slice(src); + } + + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + rust_fft::add_to_buffer(dst, src); + } + } +} #[cfg(feature = "ipp")] -pub use self::ipp_fft::{ +pub use ipp_fft::{ complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, TwoStageFFTConvolver, }; +#[cfg(feature = "ipp")] +mod ipp_ops { + use super::*; + use ipp_sys::*; + use num_complex::Complex32; + + #[derive(Clone, Default)] + pub struct IppComplexOps; + + impl ComplexOps for IppComplexOps { + type Complex = Complex32; + + fn complex_size(size: usize) -> usize { + ipp_fft::complex_size(size) + } + + 
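        // The remaining ops delegate either to the helpers in `ipp_fft` or to raw
        // IPP primitives (`ippsZero_32fc`, `ippsCopy_32fc`, `ippsAdd_32f_I`); the
        // pointer casts there assume `Complex32` shares the two-`f32` layout of
        // `Ipp32fc`.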
fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + ipp_fft::copy_and_pad(dst, src, src_size) + } + + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ) { + let temp = temp_buffer.expect("IPP implementation requires a temp buffer"); + ipp_fft::complex_multiply_accumulate(result, a, b, temp) + } + + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + ipp_fft::sum(result, a, b) + } + + fn zero_complex(buffer: &mut [Self::Complex]) { + unsafe { + ippsZero_32fc( + buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + buffer.len() as i32, + ); + } + } + + fn zero_real(buffer: &mut [f32]) { + unsafe { + ippsZero_32f(buffer.as_mut_ptr(), buffer.len() as i32); + } + } + + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + unsafe { + ippsCopy_32fc( + src.as_ptr() as *const ipp_sys::Ipp32fc, + dst.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + dst.len() as i32, + ); + } + } + + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + unsafe { + ippsAdd_32f_I(src.as_ptr(), dst.as_mut_ptr(), dst.len() as i32); + } + } + } +} #[cfg(all(test, feature = "ipp"))] mod tests { use super::*; - use crate::{fft_convolver::rust_fft, Convolution}; + use crate::{fft_convolver::ipp_fft, fft_convolver::rust_fft, Convolution}; // Helper function that runs both implementations and compares results fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs index 870c292..a677440 100644 --- a/src/fft_convolver/rust_fft.rs +++ b/src/fft_convolver/rust_fft.rs @@ -1,7 +1,8 @@ -use crate::{Convolution, Sample}; +use crate::{fft_convolver::FftBackend, Convolution, Sample}; use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; use rustfft::num_complex::Complex; use std::sync::Arc; + #[derive(Clone)] pub struct Fft { fft_forward: Arc>, @@ -24,19 +25,30 @@ impl std::fmt::Debug for Fft { } } -impl Fft { - pub fn init(&mut self, length: usize) { +impl FftBackend for Fft { + type Complex = Complex; + + fn init(&mut self, length: usize) { let mut planner = RealFftPlanner::::new(); self.fft_forward = planner.plan_fft_forward(length); self.fft_inverse = planner.plan_fft_inverse(length); } - pub fn forward(&self, input: &mut [f32], output: &mut [Complex]) -> Result<(), FftError> { - self.fft_forward.process(input, output)?; - Ok(()) + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box> { + self.fft_forward + .process(input, output) + .map_err(|e| e.into()) } - pub fn inverse(&self, input: &mut [Complex], output: &mut [f32]) -> Result<(), FftError> { + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box> { self.fft_inverse.process(input, output)?; // FFT Normalization @@ -46,221 +58,7 @@ impl Fft { Ok(()) } } -#[derive(Default, Clone)] -pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>>, - segments_ir: Vec>>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec>, - conv: Vec>, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, -} - -impl Convolution for FFTConvolver { - fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { - if max_response_length < impulse_response.len() { - panic!( - 
"max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; - let conv = vec![Complex::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - self.fft_buffer.fill(0.); - self.conv.fill(Complex::new(0., 0.)); - self.pre_multiplied.fill(Complex::new(0., 0.)); - self.overlap.fill(0.); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - self.segments_ir[i].fill(Complex::new(0., 0.)); - } - } - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - output.fill(0.); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - let input_buffer_pos = self.input_buffer_fill; - self.input_buffer[input_buffer_pos..input_buffer_pos + processing] - .clone_from_slice(&input[processed..processed + processing]); - - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_err) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - output.fill(0.); - return; // error! 
- } - - // complex multiplication - if input_buffer_was_empty { - self.pre_multiplied.fill(Complex { re: 0., im: 0. }); - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - ); - } - } - self.conv.clone_from_slice(&self.pre_multiplied); - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - ); - - // Backward FFT - if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - output.fill(0.); - return; // error! - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - self.input_buffer.fill(0.); - self.input_buffer_fill = 0; - // Save the overlap - self.overlap - .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } -} pub fn complex_size(size: usize) -> usize { (size / 2) + 1 } @@ -311,6 +109,22 @@ pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { result[i] = a[i] + b[i]; } } + +pub fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + assert_eq!(dst.len(), src.len()); + let len = dst.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + dst[i + 0] += src[i + 0]; + dst[i + 1] += src[i + 1]; + dst[i + 2] += src[i + 2]; + dst[i + 3] += src[i + 3]; + } + for i in end4..len { + dst[i] += src[i]; + } +} + #[test] fn test_fft_convolver_passthrough() { let mut response = [0.0; 1024]; @@ -324,177 +138,27 @@ fn test_fft_convolver_passthrough() { assert!((output[i] - 1.0).abs() < 1e-6); } } -#[derive(Clone)] -pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - head_block_size: usize, - tail_block_size: usize, -} + +pub type FFTConvolver = + crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = crate::fft_convolver::GenericTwoStageFFTConvolver< + Fft, + crate::fft_convolver::rust_ops::RustComplexOps, +>; impl TwoStageFFTConvolver { - pub fn init( - impulse_response: &[Sample], - _block_size: usize, + pub fn with_block_sizes( + impulse_response: &[f32], max_response_length: usize, head_block_size: usize, tail_block_size: usize, ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], + TwoStageFFTConvolver::init( + impulse_response, head_block_size, max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - 
std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, head_block_size, tail_block_size, - } - } - - pub fn update(&mut self, _response: &[Sample]) { - todo!() - } - - pub fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - let head_block_size = self.head_block_size; - let tail_block_size = self.tail_block_size; - - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - head_block_size - (self.tail_input_fill % head_block_size), - ); - - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; - - // Sum: 1st tail block - if self.tail_precalculated0.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated0[precalculated_pos]; - precalculated_pos += 1; - } - } - - // Sum: 2nd-Nth tail block - if self.tail_precalculated.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated[precalculated_pos]; - precalculated_pos += 1; - } - } - - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] - .copy_from_slice(&input[processed..processed + processing]); - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % head_block_size == 0 { - assert!(self.tail_input_fill >= head_block_size); - let block_offset = self.tail_input_fill - head_block_size; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + head_block_size], - &mut self.tail_output0[block_offset..block_offset + head_block_size], - ); - if self.tail_input_fill == tail_block_size { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if self.tail_precalculated.len() > 0 - && self.tail_input_fill == tail_block_size - && self.tail_output.len() == tail_block_size - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == tail_block_size { - self.tail_input_fill = 0; - 
self.precalculated_pos = 0; - } - - processed += processing; - } + ) } } diff --git a/src/tests.rs b/src/tests.rs index 7b2be9a..7a87161 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,7 +1,7 @@ #[cfg(test)] mod tests { use crate::crossfade_convolver::CrossfadeConvolver; - use crate::fft_convolver::FFTConvolver; + use crate::fft_convolver::{FFTConvolver, TwoStageFFTConvolver}; use crate::{Convolution, Sample}; #[test] @@ -32,6 +32,38 @@ mod tests { signal } + #[test] + fn two_stage_convolver_latency_test() { + let block_size = 32; + let input_size = block_size * 2; + let fir_size = 4096; + let mut response = vec![0.0; fir_size]; + response[63] = 1.0; + let mut two_stage_convolver_1 = + TwoStageFFTConvolver::init(&response, block_size, fir_size, 64, 1024); + let mut two_stage_convolver_2 = + TwoStageFFTConvolver::init(&response, block_size, fir_size, 32, 1024); + let mut input = vec![0.0; input_size]; + input[0] = 1.0; + let mut output_a = vec![0.0; input_size]; + let mut output_b = vec![0.0; input_size]; + let first_half_input = input[..block_size].as_ref(); + let second_half_input = input[block_size..].as_ref(); + let first_half_output_a = &mut output_a[..block_size]; + let first_half_output_b = &mut output_b[..block_size]; + two_stage_convolver_1.process(first_half_input, first_half_output_a); + two_stage_convolver_2.process(first_half_input, first_half_output_b); + let second_half_output_a = &mut output_a[block_size..]; + let second_half_output_b = &mut output_b[block_size..]; + two_stage_convolver_1.process(second_half_input, second_half_output_a); + two_stage_convolver_2.process(second_half_input, second_half_output_b); + for i in 0..output_a.len() { + assert!((output_a[i] - output_b[i]).abs() < 0.000001); + } + assert!((output_a[63] - 1.0).abs() < 0.000001); + assert!((output_b[63] - 1.0).abs() < 0.000001); + } + #[test] fn fft_convolver_update_is_reset() { let block_size = 512; From 1b1d62c093b5a8f6fee1199118dcea65202790d3 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 27 May 2025 15:37:46 +0200 Subject: [PATCH 08/11] Refactor --- src/fft_convolver/fft_convolver.rs | 253 +++++++++ src/fft_convolver/ipp_fft.rs | 148 ++++-- src/fft_convolver/mod.rs | 619 +---------------------- src/fft_convolver/rust_fft.rs | 177 ++++--- src/fft_convolver/traits.rs | 33 ++ src/fft_convolver/two_stage_convolver.rs | 188 +++++++ 6 files changed, 672 insertions(+), 746 deletions(-) create mode 100644 src/fft_convolver/fft_convolver.rs create mode 100644 src/fft_convolver/traits.rs create mode 100644 src/fft_convolver/two_stage_convolver.rs diff --git a/src/fft_convolver/fft_convolver.rs b/src/fft_convolver/fft_convolver.rs new file mode 100644 index 0000000..336ae93 --- /dev/null +++ b/src/fft_convolver/fft_convolver.rs @@ -0,0 +1,253 @@ +use crate::fft_convolver::traits::{ComplexOps, FftBackend}; +use crate::Convolution; + +#[derive(Default, Clone)] +pub struct GenericFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: F, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + _complex_ops: std::marker::PhantomData, + temp_buffer: Option>, +} + +impl> Convolution for GenericFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init(impulse_response: &[f32], block_size: 
usize, max_response_length: usize) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = C::complex_size(seg_size); + + // FFT + let mut fft = F::default(); + fft.init(seg_size); + let fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::with_capacity(seg_count); + for _ in 0..seg_count { + segments.push(vec![F::Complex::default(); fft_complex_size]); + } + + let mut segments_ir = Vec::with_capacity(seg_count); + + // prepare ir + let mut ir_buffer = vec![0.0; seg_size]; + for i in 0..seg_count { + let mut segment = vec![F::Complex::default(); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + C::copy_and_pad(&mut ir_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut ir_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![F::Complex::default(); fft_complex_size]; + let conv = vec![F::Complex::default(); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + // For IPP backend we need a temp buffer + let temp_buffer = Some(vec![F::Complex::default(); fft_complex_size]); + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + _complex_ops: std::marker::PhantomData, + temp_buffer, + } + } + + fn update(&mut self, response: &[f32]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + C::zero_real(&mut self.fft_buffer); + C::zero_complex(&mut self.conv); + C::zero_complex(&mut self.pre_multiplied); + C::zero_real(&mut self.overlap); + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + C::copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + C::zero_complex(&mut self.segments_ir[i]); + } + } + + fn process(&mut self, input: &[f32], output: &mut [f32]) { + if self.active_seg_count == 0 { + C::zero_real(output); + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input 
to input buffer + let input_buffer_pos = self.input_buffer_fill; + C::copy_and_pad( + &mut self.input_buffer[input_buffer_pos..], + &input[processed..processed + processing], + processing, + ); + + // Forward FFT + C::copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + C::zero_real(output); + return; + } + + // complex multiplication + if input_buffer_was_empty { + C::zero_complex(&mut self.pre_multiplied); + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + C::complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + } + } + + C::copy_complex(&mut self.conv, &self.pre_multiplied); + + C::complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + C::zero_real(output); + return; + } + + // Add overlap + C::sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + C::zero_real(&mut self.input_buffer); + self.input_buffer_fill = 0; + + // Save the overlap + C::copy_and_pad( + &mut self.overlap, + &self.fft_buffer[self.block_size..], + self.block_size, + ); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs index 7af4bbf..2329347 100644 --- a/src/fft_convolver/ipp_fft.rs +++ b/src/fft_convolver/ipp_fft.rs @@ -1,4 +1,7 @@ -use crate::{fft_convolver::FftBackend, Convolution, Sample}; +use crate::{ + fft_convolver::{ComplexOps, FftBackend}, + Convolution, Sample, +}; use ipp_sys::*; use num_complex::Complex32; @@ -137,71 +140,106 @@ impl FftBackend for Fft { } } -pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 -} +#[derive(Clone, Default)] +pub struct IppComplexOps; + +impl ComplexOps for IppComplexOps { + type Complex = Complex32; + fn complex_size(size: usize) -> usize { + (size / 2) + 1 + } + + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size, "Destination buffer too small"); + + unsafe { + ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); + + if dst.len() > src_size { + ippsZero_32f( + dst.as_mut_ptr().add(src_size), + (dst.len() - src_size) as i32, + ); + } + } + } -pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size, "Destination buffer too small"); + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ) { + let temp_buffer = temp_buffer.expect("IPP implementation requires a temp buffer"); + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + assert_eq!(temp_buffer.len(), a.len()); - unsafe { - ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), 
src_size as i32); + unsafe { + let len = result.len() as i32; + + ippsMul_32fc( + a.as_ptr() as *const ipp_sys::Ipp32fc, + b.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, + ); - if dst.len() > src_size { - ippsZero_32f( - dst.as_mut_ptr().add(src_size), - (dst.len() - src_size) as i32, + ippsAdd_32fc( + temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, ); } } -} -pub fn complex_multiply_accumulate( - result: &mut [Complex32], - a: &[Complex32], - b: &[Complex32], - temp_buffer: &mut [Complex32], -) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - assert_eq!(temp_buffer.len(), a.len()); - - unsafe { - let len = result.len() as i32; - - ippsMul_32fc( - a.as_ptr() as *const ipp_sys::Ipp32fc, - b.as_ptr() as *const ipp_sys::Ipp32fc, - temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - len, - ); + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); - ippsAdd_32fc( - temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, - result.as_ptr() as *const ipp_sys::Ipp32fc, - result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - len, - ); + unsafe { + ippsAdd_32f( + a.as_ptr(), + b.as_ptr(), + result.as_mut_ptr(), + result.len() as i32, + ); + } } -} -pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); + fn zero_complex(buffer: &mut [Self::Complex]) { + unsafe { + ippsZero_32fc( + buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + buffer.len() as i32, + ); + } + } - unsafe { - ippsAdd_32f( - a.as_ptr(), - b.as_ptr(), - result.as_mut_ptr(), - result.len() as i32, - ); + fn zero_real(buffer: &mut [f32]) { + unsafe { + ippsZero_32f(buffer.as_mut_ptr(), buffer.len() as i32); + } + } + + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + unsafe { + ippsCopy_32fc( + src.as_ptr() as *const ipp_sys::Ipp32fc, + dst.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + dst.len() as i32, + ); + } + } + + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + unsafe { + ippsAdd_32f_I(src.as_ptr(), dst.as_mut_ptr(), dst.len() as i32); + } } } -pub type FFTConvolver = - crate::fft_convolver::GenericFFTConvolver; -pub type TwoStageFFTConvolver = crate::fft_convolver::GenericTwoStageFFTConvolver< - Fft, - crate::fft_convolver::ipp_ops::IppComplexOps, ->; +pub type FFTConvolver = crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = + crate::fft_convolver::two_stage_convolver::GenericTwoStageFFTConvolver; diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs index bfc6450..19bd786 100644 --- a/src/fft_convolver/mod.rs +++ b/src/fft_convolver/mod.rs @@ -1,623 +1,18 @@ -use crate::Convolution; +pub mod fft_convolver; #[cfg(feature = "ipp")] pub mod ipp_fft; pub mod rust_fft; +pub mod traits; +pub mod two_stage_convolver; -// Define a trait that will be implemented by both FFT backends -pub trait FftBackend: Clone { - type Complex; +use fft_convolver::GenericFFTConvolver; +use traits::{ComplexOps, FftBackend}; - fn init(&mut self, size: usize); - fn forward( - &mut self, - input: &mut [f32], - output: &mut [Self::Complex], - ) -> Result<(), Box>; - fn inverse( - &mut self, - input: &mut [Self::Complex], - output: &mut [f32], - ) -> Result<(), Box>; -} - -// Define trait for complex operations that will be used by both backends -pub trait ComplexOps: Clone { - type Complex: 
Clone + Default; - - fn complex_size(size: usize) -> usize; - fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize); - fn complex_multiply_accumulate( - result: &mut [Self::Complex], - a: &[Self::Complex], - b: &[Self::Complex], - temp_buffer: Option<&mut [Self::Complex]>, - ); - fn sum(result: &mut [f32], a: &[f32], b: &[f32]); - fn zero_complex(buffer: &mut [Self::Complex]); - fn zero_real(buffer: &mut [f32]); - fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]); - fn add_to_buffer(dst: &mut [f32], src: &[f32]); -} - -// Generic FFTConvolver implementation that works with any backend -#[derive(Default, Clone)] -pub struct GenericFFTConvolver> -where - C: Default, - F: Default, - F::Complex: Clone + Default, -{ - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>, - segments_ir: Vec>, - fft_buffer: Vec, - fft: F, - pre_multiplied: Vec, - conv: Vec, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, - _complex_ops: std::marker::PhantomData, - temp_buffer: Option>, -} - -impl> GenericFFTConvolver -where - C: Default, - F: Default, - F::Complex: Clone + Default, -{ - fn init_internal( - impulse_response: &[f32], - block_size: usize, - max_response_length: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = C::complex_size(seg_size); - - // FFT - let mut fft = F::default(); - fft.init(seg_size); - let fft_buffer = vec![0.; seg_size]; - - // prepare segments - let mut segments = Vec::with_capacity(seg_count); - for _ in 0..seg_count { - segments.push(vec![F::Complex::default(); fft_complex_size]); - } - - let mut segments_ir = Vec::with_capacity(seg_count); - - // prepare ir - let mut ir_buffer = vec![0.0; seg_size]; - for i in 0..seg_count { - let mut segment = vec![F::Complex::default(); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - C::copy_and_pad(&mut ir_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut ir_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![F::Complex::default(); fft_complex_size]; - let conv = vec![F::Complex::default(); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - // For IPP backend we need a temp buffer - let temp_buffer = Some(vec![F::Complex::default(); fft_complex_size]); - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - _complex_ops: std::marker::PhantomData, - temp_buffer, - } - } -} - -impl> Convolution for GenericFFTConvolver -where - C: Default, - F: Default, - F::Complex: Clone + 
Default, -{ - fn init(impulse_response: &[f32], block_size: usize, max_response_length: usize) -> Self { - Self::init_internal(impulse_response, block_size, max_response_length) - } - - fn update(&mut self, response: &[f32]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - C::zero_real(&mut self.fft_buffer); - C::zero_complex(&mut self.conv); - C::zero_complex(&mut self.pre_multiplied); - C::zero_real(&mut self.overlap); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - C::copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - C::zero_complex(&mut self.segments_ir[i]); - } - } - - fn process(&mut self, input: &[f32], output: &mut [f32]) { - if self.active_seg_count == 0 { - C::zero_real(output); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - // Copy input to input buffer - let input_buffer_pos = self.input_buffer_fill; - C::copy_and_pad( - &mut self.input_buffer[input_buffer_pos..], - &input[processed..processed + processing], - processing, - ); - - // Forward FFT - C::copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - C::zero_real(output); - return; - } - - // complex multiplication - if input_buffer_was_empty { - C::zero_complex(&mut self.pre_multiplied); - - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - C::complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), - ); - } - } - - C::copy_complex(&mut self.conv, &self.pre_multiplied); - - C::complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), - ); - - // Backward FFT - if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - C::zero_real(output); - return; - } - - // Add overlap - C::sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - C::zero_real(&mut self.input_buffer); - self.input_buffer_fill = 0; - - // Save the overlap - C::copy_and_pad( - &mut self.overlap, - &self.fft_buffer[self.block_size..], - self.block_size, - ); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - 
} -} - -// Generic TwoStageFFTConvolver implementation -#[derive(Clone)] -pub struct GenericTwoStageFFTConvolver> -where - C: Default, - F: Default, - F::Complex: Clone + Default, -{ - head_convolver: GenericFFTConvolver, - tail_convolver0: GenericFFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: GenericFFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - head_block_size: usize, - tail_block_size: usize, -} - -impl> GenericTwoStageFFTConvolver -where - C: Default, - F: Default, - F::Complex: Clone + Default, -{ - pub fn init( - impulse_response: &[f32], - _block_size: usize, - max_response_length: usize, - head_block_size: usize, - tail_block_size: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = GenericFFTConvolver::::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = if max_response_length > tail_block_size { - let tail_ir_len = std::cmp::min(max_response_length - tail_block_size, tail_block_size); - GenericFFTConvolver::::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - } else { - GenericFFTConvolver::::default() - }; - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = if max_response_length > 2 * tail_block_size { - let tail_ir_len = max_response_length - 2 * tail_block_size; - GenericFFTConvolver::::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - } else { - GenericFFTConvolver::::default() - }; - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - GenericTwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - head_block_size, - tail_block_size, - } - } - - pub fn update(&mut self, _response: &[f32]) { - todo!() - } - - pub fn process(&mut self, input: &[f32], output: &mut [f32]) { - let head_block_size = self.head_block_size; - let tail_block_size = self.tail_block_size; - - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - head_block_size - (self.tail_input_fill % head_block_size), - ); - - // Sum head and tail - let sum_begin = processed; - - // Sum: 1st tail block - if !self.tail_precalculated0.is_empty() { - C::add_to_buffer( - &mut output[sum_begin..sum_begin + processing], - &self.tail_precalculated0 - [self.precalculated_pos..self.precalculated_pos + processing], - ); - } - - // Sum: 2nd-Nth tail block - if !self.tail_precalculated.is_empty() { - C::add_to_buffer( - &mut output[sum_begin..sum_begin + processing], - &self.tail_precalculated - 
[self.precalculated_pos..self.precalculated_pos + processing], - ); - } - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - C::copy_and_pad( - &mut self.tail_input[self.tail_input_fill..], - &input[processed..processed + processing], - processing, - ); - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { - assert!(self.tail_input_fill >= head_block_size); - let block_offset = self.tail_input_fill - head_block_size; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + head_block_size], - &mut self.tail_output0[block_offset..block_offset + head_block_size], - ); - if self.tail_input_fill == tail_block_size { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if !self.tail_precalculated.is_empty() - && self.tail_input_fill == tail_block_size - && self.tail_output.len() == tail_block_size - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == tail_block_size { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } -} - -// Type definitions for IPP and RustFFT implementations #[cfg(not(feature = "ipp"))] -pub use rust_fft::{ - complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, - TwoStageFFTConvolver, -}; -pub mod rust_ops { - use super::*; - use rustfft::num_complex::Complex; - - #[derive(Clone, Default)] - pub struct RustComplexOps; - - impl ComplexOps for RustComplexOps { - type Complex = Complex; - - fn complex_size(size: usize) -> usize { - rust_fft::complex_size(size) - } - - fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - rust_fft::copy_and_pad(dst, src, src_size) - } - - fn complex_multiply_accumulate( - result: &mut [Self::Complex], - a: &[Self::Complex], - b: &[Self::Complex], - _temp_buffer: Option<&mut [Self::Complex]>, - ) { - rust_fft::complex_multiply_accumulate(result, a, b) - } +pub use rust_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; - fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - rust_fft::sum(result, a, b) - } - - fn zero_complex(buffer: &mut [Self::Complex]) { - buffer.fill(Complex { re: 0.0, im: 0.0 }); - } - - fn zero_real(buffer: &mut [f32]) { - buffer.fill(0.0); - } - - fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { - dst.clone_from_slice(src); - } - - fn add_to_buffer(dst: &mut [f32], src: &[f32]) { - rust_fft::add_to_buffer(dst, src); - } - } -} - -#[cfg(feature = "ipp")] -pub use ipp_fft::{ - complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, - TwoStageFFTConvolver, -}; #[cfg(feature = "ipp")] -mod ipp_ops { - use super::*; - use ipp_sys::*; - use num_complex::Complex32; - - #[derive(Clone, Default)] - pub struct IppComplexOps; - - impl ComplexOps for IppComplexOps { - type Complex = Complex32; - - fn complex_size(size: usize) -> usize { - ipp_fft::complex_size(size) - } - - fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - ipp_fft::copy_and_pad(dst, src, src_size) - } - - fn complex_multiply_accumulate( - result: &mut [Self::Complex], - a: &[Self::Complex], - b: &[Self::Complex], - temp_buffer: Option<&mut [Self::Complex]>, - ) { - let temp = temp_buffer.expect("IPP implementation requires 
a temp buffer"); - ipp_fft::complex_multiply_accumulate(result, a, b, temp) - } - - fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - ipp_fft::sum(result, a, b) - } - - fn zero_complex(buffer: &mut [Self::Complex]) { - unsafe { - ippsZero_32fc( - buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - buffer.len() as i32, - ); - } - } - - fn zero_real(buffer: &mut [f32]) { - unsafe { - ippsZero_32f(buffer.as_mut_ptr(), buffer.len() as i32); - } - } - - fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { - unsafe { - ippsCopy_32fc( - src.as_ptr() as *const ipp_sys::Ipp32fc, - dst.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - dst.len() as i32, - ); - } - } - - fn add_to_buffer(dst: &mut [f32], src: &[f32]) { - unsafe { - ippsAdd_32f_I(src.as_ptr(), dst.as_mut_ptr(), dst.len() as i32); - } - } - } -} +pub use ipp_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; #[cfg(all(test, feature = "ipp"))] mod tests { diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs index a677440..cb58478 100644 --- a/src/fft_convolver/rust_fft.rs +++ b/src/fft_convolver/rust_fft.rs @@ -1,4 +1,7 @@ -use crate::{fft_convolver::FftBackend, Convolution, Sample}; +use crate::{ + fft_convolver::{ComplexOps, FftBackend}, + Convolution, Sample, +}; use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; use rustfft::num_complex::Complex; use std::sync::Arc; @@ -59,72 +62,91 @@ impl FftBackend for Fft { } } -pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 -} +#[derive(Clone, Default)] +pub struct RustComplexOps; -pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size); - dst[0..src_size].clone_from_slice(&src[0..src_size]); - dst[src_size..].iter_mut().for_each(|value| *value = 0.); -} +impl ComplexOps for RustComplexOps { + type Complex = Complex; -pub fn complex_multiply_accumulate( - result: &mut [Complex], - a: &[Complex], - b: &[Complex], -) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 4 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; - result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; - result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; - result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; - result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; - result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; - result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; - result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + fn complex_size(size: usize) -> usize { + (size / 2) + 1 } - for i in end4..len { - result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; - result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size); + dst[0..src_size].clone_from_slice(&src[0..src_size]); + dst[src_size..].iter_mut().for_each(|value| *value = 0.); } -} -pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0] = a[i + 0] + b[i + 0]; - result[i + 1] = a[i + 1] + b[i + 1]; - result[i + 2] = a[i + 2] + b[i + 2]; - result[i + 3] = a[i + 3] + b[i + 3]; + fn complex_multiply_accumulate( 
+ result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + _temp_buffer: Option<&mut [Self::Complex]>, + ) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 4 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; + result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; + result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; + result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; + result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; + result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; + result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; + result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + } + for i in end4..len { + result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; + result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + } } - for i in end4..len { - result[i] = a[i] + b[i]; + + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0] = a[i + 0] + b[i + 0]; + result[i + 1] = a[i + 1] + b[i + 1]; + result[i + 2] = a[i + 2] + b[i + 2]; + result[i + 3] = a[i + 3] + b[i + 3]; + } + for i in end4..len { + result[i] = a[i] + b[i]; + } } -} -pub fn add_to_buffer(dst: &mut [f32], src: &[f32]) { - assert_eq!(dst.len(), src.len()); - let len = dst.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - dst[i + 0] += src[i + 0]; - dst[i + 1] += src[i + 1]; - dst[i + 2] += src[i + 2]; - dst[i + 3] += src[i + 3]; + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + assert_eq!(dst.len(), src.len()); + let len = dst.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + dst[i + 0] += src[i + 0]; + dst[i + 1] += src[i + 1]; + dst[i + 2] += src[i + 2]; + dst[i + 3] += src[i + 3]; + } + for i in end4..len { + dst[i] += src[i]; + } } - for i in end4..len { - dst[i] += src[i]; + + fn zero_complex(buffer: &mut [Self::Complex]) { + buffer.fill(Complex { re: 0.0, im: 0.0 }); + } + + fn zero_real(buffer: &mut [f32]) { + buffer.fill(0.0); } -} + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + dst.clone_from_slice(src); + } +} #[test] fn test_fft_convolver_passthrough() { let mut response = [0.0; 1024]; @@ -139,26 +161,23 @@ fn test_fft_convolver_passthrough() { } } -pub type FFTConvolver = - crate::fft_convolver::GenericFFTConvolver; -pub type TwoStageFFTConvolver = crate::fft_convolver::GenericTwoStageFFTConvolver< - Fft, - crate::fft_convolver::rust_ops::RustComplexOps, ->; - -impl TwoStageFFTConvolver { - pub fn with_block_sizes( - impulse_response: &[f32], - max_response_length: usize, - head_block_size: usize, - tail_block_size: usize, - ) -> Self { - TwoStageFFTConvolver::init( - impulse_response, - head_block_size, - max_response_length, - head_block_size, - tail_block_size, - ) - } -} +pub type FFTConvolver = crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = + crate::fft_convolver::two_stage_convolver::GenericTwoStageFFTConvolver; + +// impl TwoStageFFTConvolver { +// pub fn with_block_sizes( +// impulse_response: &[f32], +// max_response_length: usize, +// head_block_size: usize, +// tail_block_size: usize, +// ) -> Self { +// 
TwoStageFFTConvolver::init( +// impulse_response, +// head_block_size, +// max_response_length, +// head_block_size, +// tail_block_size, +// ) +// } +// } diff --git a/src/fft_convolver/traits.rs b/src/fft_convolver/traits.rs new file mode 100644 index 0000000..3d7e835 --- /dev/null +++ b/src/fft_convolver/traits.rs @@ -0,0 +1,33 @@ +pub trait FftBackend: Clone { + type Complex; + + fn init(&mut self, size: usize); + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box>; + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box>; +} + +pub trait ComplexOps: Clone { + type Complex: Clone + Default; + + fn complex_size(size: usize) -> usize; + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize); + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ); + fn sum(result: &mut [f32], a: &[f32], b: &[f32]); + fn zero_complex(buffer: &mut [Self::Complex]); + fn zero_real(buffer: &mut [f32]); + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]); + fn add_to_buffer(dst: &mut [f32], src: &[f32]); +} diff --git a/src/fft_convolver/two_stage_convolver.rs b/src/fft_convolver/two_stage_convolver.rs new file mode 100644 index 0000000..1225b20 --- /dev/null +++ b/src/fft_convolver/two_stage_convolver.rs @@ -0,0 +1,188 @@ +use crate::fft_convolver::fft_convolver::GenericFFTConvolver; +use crate::fft_convolver::traits::{ComplexOps, FftBackend}; +use crate::Convolution; + +#[derive(Clone)] +pub struct GenericTwoStageFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + head_convolver: GenericFFTConvolver, + tail_convolver0: GenericFFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: GenericFFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, +} + +impl> GenericTwoStageFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + pub fn init( + impulse_response: &[f32], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = GenericFFTConvolver::::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = if max_response_length > tail_block_size { + let tail_ir_len = std::cmp::min(max_response_length - tail_block_size, tail_block_size); + GenericFFTConvolver::::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + } else { + GenericFFTConvolver::::default() + }; + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = if max_response_length > 2 * tail_block_size { + let tail_ir_len = max_response_length - 2 * tail_block_size; + GenericFFTConvolver::::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + } else { + 
GenericFFTConvolver::::default() + }; + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + GenericTwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + head_block_size, + tail_block_size, + } + } + + pub fn update(&mut self, _response: &[f32]) { + todo!() + } + + pub fn process(&mut self, input: &[f32], output: &mut [f32]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + head_block_size - (self.tail_input_fill % head_block_size), + ); + + // Sum head and tail + let sum_begin = processed; + + // Sum: 1st tail block + if !self.tail_precalculated0.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated0 + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + C::copy_and_pad( + &mut self.tail_input[self.tail_input_fill..], + &input[processed..processed + processing], + processing, + ); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], + ); + if self.tail_input_fill == tail_block_size { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == tail_block_size + && self.tail_output.len() == tail_block_size + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == tail_block_size { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} From 1ef9594e819c45ff8ab5b9c883a2772b3d3f21b1 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 27 May 2025 16:05:34 +0200 Subject: [PATCH 09/11] Move tests --- src/fft_convolver/ipp_fft.rs | 5 +-- src/fft_convolver/mod.rs | 71 ----------------------------------- src/fft_convolver/rust_fft.rs | 20 +--------- src/tests.rs | 63 +++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 93 deletions(-) diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs index 2329347..43071e6 100644 --- a/src/fft_convolver/ipp_fft.rs +++ b/src/fft_convolver/ipp_fft.rs @@ -1,7 +1,4 @@ -use 
crate::{ - fft_convolver::{ComplexOps, FftBackend}, - Convolution, Sample, -}; +use crate::fft_convolver::{ComplexOps, FftBackend}; use ipp_sys::*; use num_complex::Complex32; diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs index 19bd786..68f8436 100644 --- a/src/fft_convolver/mod.rs +++ b/src/fft_convolver/mod.rs @@ -13,74 +13,3 @@ pub use rust_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; #[cfg(feature = "ipp")] pub use ipp_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; - -#[cfg(all(test, feature = "ipp"))] -mod tests { - use super::*; - use crate::{fft_convolver::ipp_fft, fft_convolver::rust_fft, Convolution}; - - // Helper function that runs both implementations and compares results - fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { - let max_len = impulse_response.len(); - - // Create both implementations - let mut rust_convolver = - rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); - let mut ipp_convolver = ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); - - // Prepare output buffers - let mut rust_output = vec![0.0; input.len()]; - let mut ipp_output = vec![0.0; input.len()]; - - // Process with both implementations - rust_convolver.process(input, &mut rust_output); - ipp_convolver.process(input, &mut ipp_output); - - // Compare results (accounting for floating-point precision differences) - for i in 0..input.len() { - assert!( - (rust_output[i] - ipp_output[i]).abs() < 1e-5, - "Outputs differ at position {}: rust={}, ipp={}", - i, - rust_output[i], - ipp_output[i] - ); - } - } - - #[test] - fn test_ipp_vs_rust_impulse() { - // Test with an impulse response - let mut response = vec![0.0; 1024]; - response[0] = 1.0; - let input = vec![1.0; 1024]; - - compare_implementations(&response, &input, 256); - } - - #[test] - fn test_ipp_vs_rust_decay() { - // Test with a decaying impulse response - let mut response = vec![0.0; 1024]; - for i in 0..response.len() { - response[i] = 0.9f32.powi(i as i32); - } - let input = vec![1.0; 1024]; - - compare_implementations(&response, &input, 256); - } - - #[test] - fn test_ipp_vs_rust_sine() { - // Test with a sine wave input - let mut response = vec![0.0; 1024]; - response[0] = 1.0; - - let mut input = vec![0.0; 1024]; - for i in 0..input.len() { - input[i] = (i as f32 * 0.1).sin(); - } - - compare_implementations(&response, &input, 128); - } -} diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs index cb58478..30ccb87 100644 --- a/src/fft_convolver/rust_fft.rs +++ b/src/fft_convolver/rust_fft.rs @@ -1,8 +1,5 @@ -use crate::{ - fft_convolver::{ComplexOps, FftBackend}, - Convolution, Sample, -}; -use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; +use crate::fft_convolver::{ComplexOps, FftBackend}; +use realfft::{ComplexToReal, RealFftPlanner, RealToComplex}; use rustfft::num_complex::Complex; use std::sync::Arc; @@ -147,19 +144,6 @@ impl ComplexOps for RustComplexOps { dst.clone_from_slice(src); } } -#[test] -fn test_fft_convolver_passthrough() { - let mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); - } -} pub type FFTConvolver = crate::fft_convolver::GenericFFTConvolver; pub type TwoStageFFTConvolver = diff --git a/src/tests.rs b/src/tests.rs index 
7a87161..830b302 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -164,4 +164,67 @@ mod tests { } } } + + #[cfg(feature = "ipp")] + mod ipp_comparison_tests { + use crate::{fft_convolver::ipp_fft, fft_convolver::rust_fft, Convolution}; + + fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { + let max_len = impulse_response.len(); + + let mut rust_convolver = + rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); + let mut ipp_convolver = + ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); + + let mut rust_output = vec![0.0; input.len()]; + let mut ipp_output = vec![0.0; input.len()]; + + rust_convolver.process(input, &mut rust_output); + ipp_convolver.process(input, &mut ipp_output); + + for i in 0..input.len() { + assert!( + (rust_output[i] - ipp_output[i]).abs() < 1e-5, + "Outputs differ at position {}: rust={}, ipp={}", + i, + rust_output[i], + ipp_output[i] + ); + } + } + + #[test] + fn test_ipp_vs_rust_impulse() { + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_decay() { + let mut response = vec![0.0; 1024]; + for i in 0..response.len() { + response[i] = 0.9f32.powi(i as i32); + } + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_sine() { + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + + let mut input = vec![0.0; 1024]; + for i in 0..input.len() { + input[i] = (i as f32 * 0.1).sin(); + } + + compare_implementations(&response, &input, 128); + } + } } From a4930342e75b6cd4755b91f38d9f3e0577b13626 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 27 May 2025 16:23:50 +0200 Subject: [PATCH 10/11] Reimplement convolution trait for two state convolver --- src/fft_convolver/two_stage_convolver.rs | 27 +++++++++++++++++++++--- src/tests.rs | 6 +++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/fft_convolver/two_stage_convolver.rs b/src/fft_convolver/two_stage_convolver.rs index 1225b20..68e597f 100644 --- a/src/fft_convolver/two_stage_convolver.rs +++ b/src/fft_convolver/two_stage_convolver.rs @@ -2,6 +2,9 @@ use crate::fft_convolver::fft_convolver::GenericFFTConvolver; use crate::fft_convolver::traits::{ComplexOps, FftBackend}; use crate::Convolution; +const DEFAULT_HEAD_BLOCK_SIZE: usize = 128; +const DEFAULT_TAIL_BLOCK_SIZE: usize = 1024; + #[derive(Clone)] pub struct GenericTwoStageFFTConvolver> where @@ -29,7 +32,7 @@ where F: Default, F::Complex: Clone + Default, { - pub fn init( + pub fn with_sizes( impulse_response: &[f32], _block_size: usize, max_response_length: usize, @@ -97,12 +100,30 @@ where tail_block_size, } } +} + +impl> Convolution + for GenericTwoStageFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init(impulse_response: &[f32], _block_size: usize, max_response_length: usize) -> Self { + Self::with_sizes( + impulse_response, + _block_size, + max_response_length, + DEFAULT_HEAD_BLOCK_SIZE, + DEFAULT_TAIL_BLOCK_SIZE, + ) + } - pub fn update(&mut self, _response: &[f32]) { + fn update(&mut self, _response: &[f32]) { todo!() } - pub fn process(&mut self, input: &[f32], output: &mut [f32]) { + fn process(&mut self, input: &[f32], output: &mut [f32]) { let head_block_size = self.head_block_size; let tail_block_size = self.tail_block_size; diff --git a/src/tests.rs b/src/tests.rs index 830b302..7cbb439 100644 --- a/src/tests.rs +++ 
b/src/tests.rs @@ -33,16 +33,16 @@ mod tests { } #[test] - fn two_stage_convolver_latency_test() { + fn two_stage_convolver_head_tail_sizes_test() { let block_size = 32; let input_size = block_size * 2; let fir_size = 4096; let mut response = vec![0.0; fir_size]; response[63] = 1.0; let mut two_stage_convolver_1 = - TwoStageFFTConvolver::init(&response, block_size, fir_size, 64, 1024); + TwoStageFFTConvolver::with_sizes(&response, block_size, fir_size, 64, 1024); let mut two_stage_convolver_2 = - TwoStageFFTConvolver::init(&response, block_size, fir_size, 32, 1024); + TwoStageFFTConvolver::with_sizes(&response, block_size, fir_size, 32, 1024); let mut input = vec![0.0; input_size]; input[0] = 1.0; let mut output_a = vec![0.0; input_size]; From 0720c34af374e4bed0db30784a99d33825ccab0b Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 27 May 2025 17:05:34 +0200 Subject: [PATCH 11/11] Pin ipp-sys commit SHA --- Cargo.lock | 16 ++++++++-------- Cargo.toml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ffe61c..74a16b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,12 +21,12 @@ dependencies = [ [[package]] name = "ipp-headers-sys" version = "0.4.5" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" [[package]] name = "ipp-sys" version = "0.4.5" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-headers-sys", "link-ippcore", @@ -39,12 +39,12 @@ dependencies = [ [[package]] name = "ipp-sys-build-help" version = "0.1.2" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" [[package]] name = "link-ippcore" version = "0.1.1" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", ] @@ -52,7 +52,7 @@ dependencies = [ [[package]] name = "link-ippcv" version = "0.1.2" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", "link-ippcore", @@ -62,7 +62,7 @@ dependencies = [ [[package]] name = "link-ippi" version = "0.1.2" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", "link-ippcore", @@ -72,7 +72,7 @@ dependencies = [ [[package]] name = "link-ipps" version = "0.1.2" -source = 
"git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", "link-ippcore", @@ -82,7 +82,7 @@ dependencies = [ [[package]] name = "link-ippvm" version = "0.1.0" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", "link-ippcore", diff --git a/Cargo.toml b/Cargo.toml index 33c4d45..ba0578c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" [dependencies] realfft = "3.3.0" rustfft = "6.1.0" -ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", branch = "dan/generate-2022-bindings", features = ["2022"], optional = true } +ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", rev = "3e69ca21c1e24b141ab225a743910d999e591a4c", features = ["2022"], optional = true } num-complex = "0.4.6" [features]