From 7a60c2859fcbe9fa86e8959a43764532708e1108 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 6 May 2025 10:21:23 +0200 Subject: [PATCH 01/11] Add ipp feature --- Cargo.lock | 73 ++- Cargo.toml | 5 + src/fft_convolver.rs | 1014 ++++++++++++++++++++++++------------------ 3 files changed, 662 insertions(+), 430 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bd6cbfa..9b1d380 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "autocfg" @@ -12,10 +12,81 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" name = "convolution" version = "0.1.0" dependencies = [ + "ipp-sys", "realfft", "rustfft", ] +[[package]] +name = "ipp-headers-sys" +version = "0.4.5" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" + +[[package]] +name = "ipp-sys" +version = "0.4.5" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-headers-sys", + "link-ippcore", + "link-ippcv", + "link-ippi", + "link-ipps", + "link-ippvm", +] + +[[package]] +name = "ipp-sys-build-help" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" + +[[package]] +name = "link-ippcore" +version = "0.1.1" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", +] + +[[package]] +name = "link-ippcv" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ipps", +] + +[[package]] +name = "link-ippi" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ipps", +] + +[[package]] +name = "link-ipps" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ippvm", +] + +[[package]] +name = "link-ippvm" +version = "0.1.0" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", +] + [[package]] name = "num-complex" version = "0.4.6" diff --git a/Cargo.toml b/Cargo.toml index 1b2270d..48b314f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,3 +8,8 @@ edition = "2021" [dependencies] realfft = "3.3.0" rustfft = "6.1.0" +ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", branch = "dan/generate-2022-bindings", features = ["2022"], optional = true } + +[features] +default = [] +ipp = ["ipp-sys"] diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs index b3b1818..47f3c75 100644 --- a/src/fft_convolver.rs +++ b/src/fft_convolver.rs @@ -1,498 +1,654 @@ -use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; -use rustfft::num_complex::Complex; -use std::sync::Arc; - -use crate::{Convolution, Sample}; - -#[derive(Clone)] 
-pub struct Fft { - fft_forward: Arc>, - fft_inverse: Arc>, -} - -impl Default for Fft { - fn default() -> Self { - let mut planner = RealFftPlanner::::new(); - Self { - fft_forward: planner.plan_fft_forward(0), - fft_inverse: planner.plan_fft_inverse(0), - } +mod rust_fft { + use crate::{Convolution, Sample}; + use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; + use rustfft::num_complex::Complex; + use std::sync::Arc; + #[derive(Clone)] + pub struct Fft { + fft_forward: Arc>, + fft_inverse: Arc>, } -} - -impl std::fmt::Debug for Fft { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "") - } -} -impl Fft { - pub fn init(&mut self, length: usize) { - let mut planner = RealFftPlanner::::new(); - self.fft_forward = planner.plan_fft_forward(length); - self.fft_inverse = planner.plan_fft_inverse(length); + impl Default for Fft { + fn default() -> Self { + let mut planner = RealFftPlanner::::new(); + Self { + fft_forward: planner.plan_fft_forward(0), + fft_inverse: planner.plan_fft_inverse(0), + } + } } - pub fn forward(&self, input: &mut [f32], output: &mut [Complex]) -> Result<(), FftError> { - self.fft_forward.process(input, output)?; - Ok(()) + impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "") + } } - pub fn inverse(&self, input: &mut [Complex], output: &mut [f32]) -> Result<(), FftError> { - self.fft_inverse.process(input, output)?; - - // FFT Normalization - let len = output.len(); - output.iter_mut().for_each(|bin| *bin /= len as f32); - - Ok(()) - } -} + impl Fft { + pub fn init(&mut self, length: usize) { + let mut planner = RealFftPlanner::::new(); + self.fft_forward = planner.plan_fft_forward(length); + self.fft_inverse = planner.plan_fft_inverse(length); + } -pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 -} + pub fn forward( + &self, + input: &mut [f32], + output: &mut [Complex], + ) -> Result<(), FftError> { + self.fft_forward.process(input, output)?; + Ok(()) + } -pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size); - dst[0..src_size].clone_from_slice(&src[0..src_size]); - dst[src_size..].iter_mut().for_each(|value| *value = 0.); -} + pub fn inverse( + &self, + input: &mut [Complex], + output: &mut [f32], + ) -> Result<(), FftError> { + self.fft_inverse.process(input, output)?; -pub fn complex_multiply_accumulate( - result: &mut [Complex], - a: &[Complex], - b: &[Complex], -) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 4 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; - result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; - result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; - result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; - result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; - result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; - result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; - result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; - } - for i in end4..len { - result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; - result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; - } -} + // FFT Normalization + let len = output.len(); + output.iter_mut().for_each(|bin| *bin /= len as f32); -pub fn 
sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0] = a[i + 0] + b[i + 0]; - result[i + 1] = a[i + 1] + b[i + 1]; - result[i + 2] = a[i + 2] + b[i + 2]; - result[i + 3] = a[i + 3] + b[i + 3]; + Ok(()) + } } - for i in end4..len { - result[i] = a[i] + b[i]; + #[derive(Default, Clone)] + pub struct FFTConvolver { + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>>, + segments_ir: Vec>>, + fft_buffer: Vec, + fft: Fft, + pre_multiplied: Vec>, + conv: Vec>, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, } -} -#[derive(Default, Clone)] -pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>>, - segments_ir: Vec>>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec>, - conv: Vec>, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, -} -impl Convolution for FFTConvolver { - fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { - if max_response_length < impulse_response.len() { - panic!( + impl Convolution for FFTConvolver { + fn init( + impulse_response: &[Sample], + block_size: usize, + max_response_length: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( "max_response_length must be at least the length of the initial impulse response" ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = complex_size(seg_size); + + // FFT + let mut fft = Fft::default(); + fft.init(seg_size); + let mut fft_buffer = vec![0.; seg_size]; + + // prepare segments + let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; + let mut segments_ir = Vec::new(); + + // prepare ir + for i in 0..seg_count { + let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } 
else { + remaining + }; + copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut fft_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } - // prepare convolution buffers - let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; - let conv = vec![Complex::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, + // prepare convolution buffers + let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; + let conv = vec![Complex::new(0., 0.); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + } } - } - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); + fn update(&mut self, response: &[Sample]) { + let new_ir_len = response.len(); - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } - if self.ir_len == 0 { - return; - } + if self.ir_len == 0 { + return; + } - self.fft_buffer.fill(0.); - self.conv.fill(Complex::new(0., 0.)); - self.pre_multiplied.fill(Complex::new(0., 0.)); - self.overlap.fill(0.); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } + self.fft_buffer.fill(0.); + self.conv.fill(Complex::new(0., 0.)); + self.pre_multiplied.fill(Complex::new(0., 0.)); + self.overlap.fill(0.); - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - self.segments_ir[i].fill(Complex::new(0., 0.)); - } - } + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - output.fill(0.); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + copy_and_pad( + &mut self.fft_buffer, + &response[i * 
self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } - let input_buffer_pos = self.input_buffer_fill; - self.input_buffer[input_buffer_pos..input_buffer_pos + processing] - .clone_from_slice(&input[processed..processed + processing]); + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + self.segments_ir[i].fill(Complex::new(0., 0.)); + } + } - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_err) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + if self.active_seg_count == 0 { output.fill(0.); - return; // error! + return; } - // complex multiplication - if input_buffer_was_empty { - self.pre_multiplied.fill(Complex { re: 0., im: 0. }); - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - ); + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + let input_buffer_pos = self.input_buffer_fill; + self.input_buffer[input_buffer_pos..input_buffer_pos + processing] + .clone_from_slice(&input[processed..processed + processing]); + + // Forward FFT + copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_err) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + output.fill(0.); + return; // error! } - } - self.conv.clone_from_slice(&self.pre_multiplied); - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - ); - // Backward FFT - if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - output.fill(0.); - return; // error! - } + // complex multiplication + if input_buffer_was_empty { + self.pre_multiplied.fill(Complex { re: 0., im: 0. }); + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + ); + } + } + self.conv.clone_from_slice(&self.pre_multiplied); + complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + ); - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); + // Backward FFT + if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + output.fill(0.); + return; // error! 
+ } - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - self.input_buffer.fill(0.); - self.input_buffer_fill = 0; - // Save the overlap - self.overlap - .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; + // Add overlap + sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + self.input_buffer.fill(0.); + self.input_buffer_fill = 0; + // Save the overlap + self.overlap + .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; } - processed += processing; } } -} + pub fn complex_size(size: usize) -> usize { + (size / 2) + 1 + } -#[test] -fn test_fft_convolver_passthrough() { - let mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); + pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size); + dst[0..src_size].clone_from_slice(&src[0..src_size]); + dst[src_size..].iter_mut().for_each(|value| *value = 0.); } -} -#[derive(Clone)] -pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, -} + pub fn complex_multiply_accumulate( + result: &mut [Complex], + a: &[Complex], + b: &[Complex], + ) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 4 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; + result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; + result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; + result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; + result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; + result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; + result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; + result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + } + for i in end4..len { + result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; + result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + } + } -const HEAD_BLOCK_SIZE: usize = 128; -const TAIL_BLOCK_SIZE: usize = 1024; + pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0] = a[i + 0] + 
b[i + 0]; + result[i + 1] = a[i + 1] + b[i + 1]; + result[i + 2] = a[i + 2] + b[i + 2]; + result[i + 3] = a[i + 3] + b[i + 3]; + } + for i in end4..len { + result[i] = a[i] + b[i]; + } + } + #[test] + fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } + } + #[derive(Clone)] + pub struct TwoStageFFTConvolver { + head_convolver: FFTConvolver, + tail_convolver0: FFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: FFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + } + + const HEAD_BLOCK_SIZE: usize = 128; + const TAIL_BLOCK_SIZE: usize = 1024; -impl Convolution for TwoStageFFTConvolver { - fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; + impl Convolution for TwoStageFFTConvolver { + fn init( + impulse_response: &[Sample], + _block_size: usize, + max_response_length: usize, + ) -> Self { + let head_block_size = HEAD_BLOCK_SIZE; + let tail_block_size = TAIL_BLOCK_SIZE; - if max_response_length < impulse_response.len() { - panic!( + if max_response_length < impulse_response.len() { + panic!( "max_response_length must be at least the length of the initial impulse response" ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = FFTConvolver::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = (max_response_length > tail_block_size) + .then(|| { + let tail_ir_len = + std::cmp::min(max_response_length - tail_block_size, tail_block_size); + FFTConvolver::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = (max_response_length > 2 * tail_block_size) + .then(|| { + let tail_ir_len = max_response_length - 2 * tail_block_size; + FFTConvolver::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + TwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + } } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); 
- FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, + + fn update(&mut self, _response: &[Sample]) { + todo!() } - } - fn update(&mut self, _response: &[Sample]) { - todo!() - } + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + // Head + self.head_convolver.process(input, output); - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - // Head - self.head_convolver.process(input, output); + // Tail + if self.tail_input.is_empty() { + return; + } - // Tail - if self.tail_input.is_empty() { - return; - } + let len = input.len(); + let mut processed = 0; - let len = input.len(); - let mut processed = 0; + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + ); - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), - ); + // Sum head and tail + let sum_begin = processed; + let sum_end = processed + processing; + + // Sum: 1st tail block + if self.tail_precalculated0.len() > 0 { + let mut precalculated_pos = self.precalculated_pos; + for i in sum_begin..sum_end { + output[i] += self.tail_precalculated0[precalculated_pos]; + precalculated_pos += 1; + } + } - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; + // Sum: 2nd-Nth tail block + if self.tail_precalculated.len() > 0 { + let mut precalculated_pos = self.precalculated_pos; + for i in sum_begin..sum_end { + output[i] += self.tail_precalculated[precalculated_pos]; + precalculated_pos += 1; + } + } - // Sum: 1st tail block - if self.tail_precalculated0.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated0[precalculated_pos]; - precalculated_pos += 1; + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] + .copy_from_slice(&input[processed..processed + processing]); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 + { + assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); + let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], + &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + ); + if 
self.tail_input_fill == TAIL_BLOCK_SIZE { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if self.tail_precalculated.len() > 0 + && self.tail_input_fill == TAIL_BLOCK_SIZE + && self.tail_output.len() == TAIL_BLOCK_SIZE + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); } - } - // Sum: 2nd-Nth tail block - if self.tail_precalculated.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated[precalculated_pos]; - precalculated_pos += 1; + if self.tail_input_fill == TAIL_BLOCK_SIZE { + self.tail_input_fill = 0; + self.precalculated_pos = 0; } + + processed += processing; + } + } + } +} + +#[cfg(feature = "ipp")] +mod ipp_fft { + use ipp_sys::*; + use num_complex::Complex32; + use std::mem; + use std::ptr; + use std::slice; + + pub struct FFT { + size: usize, + spec: Vec, + scratch_buffer: Vec, + } + + impl FFT { + pub fn new(size: usize) -> Self { + assert!(size.is_power_of_two(), "FFT size must be a power of 2"); + + let mut spec_size = 0; + let mut init_size = 0; + let mut work_buf_size = 0; + + unsafe { + ippsDFTGetSize_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + &mut spec_size, + &mut init_size, + &mut work_buf_size, + ); } - self.precalculated_pos += processing; + // Allocate aligned memory for buffers to get best SIMD performance + let mut init_buffer = allocate_aligned_buffer(init_size as usize); + let scratch_buffer = allocate_aligned_buffer(work_buf_size as usize); + let mut spec = allocate_aligned_buffer(spec_size as usize); + + unsafe { + ippsDFTInit_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + spec.as_mut_ptr(), + init_buffer.as_mut_ptr(), + ); + } - // Fill input buffer for tail convolution - self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] - .copy_from_slice(&input[processed..processed + processing]); - self.tail_input_fill += processing; + Self { + size, + spec, + scratch_buffer, + } + } + + pub fn forward(&mut self, input: &[f32], output: &mut [Complex32]) { + assert_eq!(input.len(), self.size); + assert_eq!(output.len(), self.size / 2 + 1); - // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + unsafe { + ippsDFTFwd_RToCCS_32f( + input.as_ptr(), + output.as_mut_ptr() as *mut _, + self.spec.as_ptr(), + self.scratch_buffer.as_mut_ptr(), ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } } + } + + pub fn inverse(&mut self, input: &[Complex32], output: &mut [f32]) { + assert_eq!(input.len(), self.size / 2 + 1); + assert_eq!(output.len(), self.size); - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if self.tail_precalculated.len() > 0 - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut 
self.tail_output); + unsafe { + ippsDFTInv_CCSToR_32f( + input.as_ptr() as *const _, + output.as_mut_ptr(), + self.spec.as_ptr(), + self.scratch_buffer.as_mut_ptr(), + ); } + } - if self.tail_input_fill == TAIL_BLOCK_SIZE { - self.tail_input_fill = 0; - self.precalculated_pos = 0; + pub fn size(&self) -> usize { + self.size + } + + pub fn normalization_factor(&self) -> f32 { + self.size as f32 + } + } + + // Helper function to allocate aligned memory using IPP functions + fn allocate_aligned_buffer(size: usize) -> Vec { + if size == 0 { + return Vec::new(); + } + + unsafe { + let ptr = ippsMalloc_8u(size as i32); + if ptr.is_null() { + panic!("Failed to allocate aligned memory"); } - processed += processing; + // Create a vector that will free the memory when dropped + let mut v = Vec::with_capacity(0); + let v_ptr = v.as_mut_ptr(); + mem::forget(v); + + // Construct a vector from the IPP-aligned pointer + Vec::from_raw_parts(ptr, size, size) + } + } + + // Drop trait implementation to free aligned memory + impl Drop for FFT { + fn drop(&mut self) { + unsafe { + if !self.spec.is_empty() { + let ptr = self.spec.as_mut_ptr(); + let len = self.spec.len(); + mem::forget(mem::replace(&mut self.spec, Vec::new())); + ippsFree(ptr as *mut _); + } + + if !self.scratch_buffer.is_empty() { + let ptr = self.scratch_buffer.as_mut_ptr(); + let len = self.scratch_buffer.len(); + mem::forget(mem::replace(&mut self.scratch_buffer, Vec::new())); + ippsFree(ptr as *mut _); + } + } } } } + +#[cfg(not(feature = "ipp"))] +pub use rust_fft::{complex_multiply_accumulate, complex_size, sum, FFTConvolver, Fft}; + +#[cfg(feature = "ipp")] +pub use self::ipp_fft::FFT; From 38bbb61e71a99acfd9b3a5fd435bdc23a75d9eea Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 6 May 2025 10:37:32 +0200 Subject: [PATCH 02/11] Add crude FFT and convolvers implementation using ipp --- Cargo.lock | 1 + Cargo.toml | 1 + src/fft_convolver.rs | 702 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 665 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9b1d380..5ffe61c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,6 +13,7 @@ name = "convolution" version = "0.1.0" dependencies = [ "ipp-sys", + "num-complex", "realfft", "rustfft", ] diff --git a/Cargo.toml b/Cargo.toml index 48b314f..33c4d45 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ edition = "2021" realfft = "3.3.0" rustfft = "6.1.0" ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", branch = "dan/generate-2022-bindings", features = ["2022"], optional = true } +num-complex = "0.4.6" [features] default = [] diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs index 47f3c75..f83b2d7 100644 --- a/src/fft_convolver.rs +++ b/src/fft_convolver.rs @@ -513,20 +513,60 @@ mod rust_fft { #[cfg(feature = "ipp")] mod ipp_fft { + use crate::{Convolution, Sample}; use ipp_sys::*; use num_complex::Complex32; use std::mem; use std::ptr; use std::slice; - pub struct FFT { + pub struct Fft { size: usize, spec: Vec, scratch_buffer: Vec, } - impl FFT { + impl Default for Fft { + fn default() -> Self { + Self { + size: 0, + spec: Vec::new(), + scratch_buffer: Vec::new(), + } + } + } + + impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "IppFft(size: {})", self.size) + } + } + + impl Clone for Fft { + fn clone(&self) -> Self { + let mut new_fft = Fft::default(); + if self.size > 0 { + new_fft.init(self.size); + } + new_fft + } + } + + impl Fft { pub fn new(size: usize) -> Self { + 
let mut fft = Self::default(); + fft.init(size); + fft + } + + pub fn init(&mut self, size: usize) { + // Clean up old resources if any + self.clean_up(); + + if size == 0 { + return; + } + assert!(size.is_power_of_two(), "FFT size must be a power of 2"); let mut spec_size = 0; @@ -554,52 +594,108 @@ mod ipp_fft { size as i32, 8, // IPP_FFT_NODIV_BY_ANY 0, // No special hint - spec.as_mut_ptr(), + spec.as_mut_ptr() as *mut ipp_sys::IppsDFTSpec_R_32f, init_buffer.as_mut_ptr(), ); } - Self { - size, - spec, - scratch_buffer, + // Free the init buffer as it's no longer needed + unsafe { + if !init_buffer.is_empty() { + let ptr = init_buffer.as_mut_ptr(); + mem::forget(mem::replace(&mut init_buffer, Vec::new())); + ippsFree(ptr as *mut _); + } } + + self.size = size; + self.spec = spec; + self.scratch_buffer = scratch_buffer; } - pub fn forward(&mut self, input: &[f32], output: &mut [Complex32]) { - assert_eq!(input.len(), self.size); - assert_eq!(output.len(), self.size / 2 + 1); + pub fn forward( + &mut self, + input: &mut [f32], + output: &mut [Complex32], + ) -> Result<(), &'static str> { + if self.size == 0 { + return Err("FFT not initialized"); + } + + assert_eq!(input.len(), self.size, "Input length must match FFT size"); + assert_eq!( + output.len(), + self.size / 2 + 1, + "Output length must be size/2 + 1" + ); unsafe { ippsDFTFwd_RToCCS_32f( input.as_ptr(), output.as_mut_ptr() as *mut _, - self.spec.as_ptr(), + self.spec.as_ptr() as *const DFTSpec_R_32f, self.scratch_buffer.as_mut_ptr(), ); } + + Ok(()) } - pub fn inverse(&mut self, input: &[Complex32], output: &mut [f32]) { - assert_eq!(input.len(), self.size / 2 + 1); - assert_eq!(output.len(), self.size); + pub fn inverse( + &mut self, + input: &mut [Complex32], + output: &mut [f32], + ) -> Result<(), &'static str> { + if self.size == 0 { + return Err("FFT not initialized"); + } + + assert_eq!( + input.len(), + self.size / 2 + 1, + "Input length must be size/2 + 1" + ); + assert_eq!(output.len(), self.size, "Output length must match FFT size"); unsafe { ippsDFTInv_CCSToR_32f( input.as_ptr() as *const _, output.as_mut_ptr(), - self.spec.as_ptr(), + self.spec.as_ptr() as *const DFTSpec_R_32f, self.scratch_buffer.as_mut_ptr(), ); } + + // Normalize the result + unsafe { + ippsDivC_32f_I(self.size as f32, output.as_mut_ptr(), self.size as i32); + } + + Ok(()) } - pub fn size(&self) -> usize { - self.size + fn clean_up(&mut self) { + unsafe { + if !self.spec.is_empty() { + let ptr = self.spec.as_mut_ptr(); + mem::forget(mem::replace(&mut self.spec, Vec::new())); + ippsFree(ptr as *mut _); + } + + if !self.scratch_buffer.is_empty() { + let ptr = self.scratch_buffer.as_mut_ptr(); + mem::forget(mem::replace(&mut self.scratch_buffer, Vec::new())); + ippsFree(ptr as *mut _); + } + } + self.size = 0; } + } - pub fn normalization_factor(&self) -> f32 { - self.size as f32 + // Drop trait implementation to free aligned memory + impl Drop for Fft { + fn drop(&mut self) { + self.clean_up(); } } @@ -615,40 +711,568 @@ mod ipp_fft { panic!("Failed to allocate aligned memory"); } - // Create a vector that will free the memory when dropped - let mut v = Vec::with_capacity(0); - let v_ptr = v.as_mut_ptr(); - mem::forget(v); - // Construct a vector from the IPP-aligned pointer Vec::from_raw_parts(ptr, size, size) } } - // Drop trait implementation to free aligned memory - impl Drop for FFT { - fn drop(&mut self) { + // Helper functions for FFT convolution + pub fn complex_size(size: usize) -> usize { + (size / 2) + 1 + } + + pub fn copy_and_pad(dst: 
&mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size, "Destination buffer too small"); + + // Copy the source data + unsafe { + ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); + + // Zero-pad the rest + if dst.len() > src_size { + ippsZero_32f( + dst.as_mut_ptr().add(src_size), + (dst.len() - src_size) as i32, + ); + } + } + } + + pub fn complex_multiply_accumulate(result: &mut [Complex32], a: &[Complex32], b: &[Complex32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + + unsafe { + // IPP doesn't have a direct complex multiply-accumulate, so we do it in two steps + // First, multiply the complex arrays + let len = result.len(); + let mut temp = vec![Complex32::new(0.0, 0.0); len]; + + ippsMulC_32fc( + a.as_ptr() as *const ipp_sys::Ipp32fc, + *(b.as_ptr() as *const ipp_sys::Ipp32fc), + temp.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len as i32, + ); + + // Then add to the result + ippsAdd_32fc( + temp.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len as i32, + ); + } + } + + pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + + unsafe { + ippsAdd_32f( + a.as_ptr(), + b.as_ptr(), + result.as_mut_ptr(), + result.len() as i32, + ); + } + } + + #[derive(Default, Clone)] + pub struct FFTConvolver { + ir_len: usize, + block_size: usize, + seg_size: usize, + seg_count: usize, + active_seg_count: usize, + fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: Fft, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + } + + impl Convolution for FFTConvolver { + fn init( + impulse_response: &[Sample], + block_size: usize, + max_response_length: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = complex_size(seg_size); + + // FFT + let mut fft = Fft::default(); + fft.init(seg_size); + let mut fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::new(); + for _ in 0..seg_count { + segments.push(vec![Complex32::new(0., 0.); fft_complex_size]); + } + + let mut segments_ir = Vec::new(); + + // prepare ir + for i in 0..seg_count { + let mut segment = vec![Complex32::new(0., 0.); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut fft_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![Complex32::new(0., 0.); fft_complex_size]; + let conv = vec![Complex32::new(0., 0.); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + Self { + ir_len, + block_size, + seg_size, + 
seg_count, + active_seg_count, + fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + } + } + + fn update(&mut self, response: &[Sample]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + unsafe { - if !self.spec.is_empty() { - let ptr = self.spec.as_mut_ptr(); - let len = self.spec.len(); - mem::forget(mem::replace(&mut self.spec, Vec::new())); - ippsFree(ptr as *mut _); + // Zero out buffers + ippsZero_32f(self.fft_buffer.as_mut_ptr(), self.fft_buffer.len() as i32); + ippsZero_32fc( + self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.conv.len() as i32, + ); + ippsZero_32fc( + self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.pre_multiplied.len() as i32, + ); + ippsZero_32f(self.overlap.as_mut_ptr(), self.overlap.len() as i32); + } + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + unsafe { + ippsZero_32fc( + self.segments_ir[i].as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.segments_ir[i].len() as i32, + ); } + } + } - if !self.scratch_buffer.is_empty() { - let ptr = self.scratch_buffer.as_mut_ptr(); - let len = self.scratch_buffer.len(); - mem::forget(mem::replace(&mut self.scratch_buffer, Vec::new())); - ippsFree(ptr as *mut _); + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + if self.active_seg_count == 0 { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input to input buffer + let input_buffer_pos = self.input_buffer_fill; + unsafe { + ippsCopy_32f( + &input[processed], + &mut self.input_buffer[input_buffer_pos], + processing as i32, + ); + } + + // Forward FFT + copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + // Complex multiplication + if input_buffer_was_empty { + unsafe { + ippsZero_32fc( + self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.pre_multiplied.len() as i32, + ); + } + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + ); + } + } + + // Copy pre-multiplied to conv + unsafe { + ippsCopy_32fc( + self.pre_multiplied.as_ptr() as *const ipp_sys::Ipp32fc, + self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.conv.len() as i32, + ); + } + + complex_multiply_accumulate( + &mut 
self.conv, + &self.segments[self.current], + &self.segments_ir[0], + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + // Add overlap + sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + unsafe { + ippsZero_32f( + self.input_buffer.as_mut_ptr(), + self.input_buffer.len() as i32, + ); + } + self.input_buffer_fill = 0; + + // Save the overlap + unsafe { + ippsCopy_32f( + &self.fft_buffer[self.block_size], + self.overlap.as_mut_ptr(), + self.block_size as i32, + ); + } + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } + } + + #[derive(Clone)] + pub struct TwoStageFFTConvolver { + head_convolver: FFTConvolver, + tail_convolver0: FFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: FFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + } + + const HEAD_BLOCK_SIZE: usize = 128; + const TAIL_BLOCK_SIZE: usize = 1024; + + impl Convolution for TwoStageFFTConvolver { + fn init( + impulse_response: &[Sample], + _block_size: usize, + max_response_length: usize, + ) -> Self { + let head_block_size = HEAD_BLOCK_SIZE; + let tail_block_size = TAIL_BLOCK_SIZE; + + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = FFTConvolver::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = (max_response_length > tail_block_size) + .then(|| { + let tail_ir_len = + std::cmp::min(max_response_length - tail_block_size, tail_block_size); + FFTConvolver::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = (max_response_length > 2 * tail_block_size) + .then(|| { + let tail_ir_len = max_response_length - 2 * tail_block_size; + FFTConvolver::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + TwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + } + } + + fn update(&mut self, _response: &[Sample]) { + todo!() + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + // Head + self.head_convolver.process(input, 
output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + ); + + // Sum head and tail + let sum_begin = processed; + let sum_end = processed + processing; + + // Sum: 1st tail block + if !self.tail_precalculated0.is_empty() { + let mut precalculated_pos = self.precalculated_pos; + + // Use IPP to add the tail block to output + unsafe { + ippsAdd_32f( + &self.tail_precalculated0[precalculated_pos], + &output[sum_begin], + &mut output[sum_begin], + processing as i32, + ); + // ippsAddC_32f_I( + // std::slice::from_raw_parts( + // self.tail_precalculated0.as_ptr().add(precalculated_pos), + // processing, + // ) + // .as_ptr(), + // output[sum_begin..sum_end].as_mut_ptr(), + // processing as i32, + // ); + } + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + let precalculated_pos = self.precalculated_pos; + + // Use IPP to add the tail block to output + unsafe { + // ippsAddC_32f_I( + // std::slice::from_raw_parts( + // self.tail_precalculated.as_ptr().add(precalculated_pos), + // processing, + // ) + // .as_ptr(), + // output[sum_begin..sum_end].as_mut_ptr(), + // processing as i32, + // ); + ippsAdd_32f( + &self.tail_precalculated[precalculated_pos], + &output[sum_begin], + &mut output[sum_begin], + processing as i32, + ); + } } + + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + unsafe { + ippsCopy_32f( + &input[processed], + &mut self.tail_input[self.tail_input_fill], + processing as i32, + ); + } + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() + && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 + { + assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); + let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], + &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + ); + if self.tail_input_fill == TAIL_BLOCK_SIZE { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == TAIL_BLOCK_SIZE + && self.tail_output.len() == TAIL_BLOCK_SIZE + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == TAIL_BLOCK_SIZE { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; } } } } #[cfg(not(feature = "ipp"))] -pub use rust_fft::{complex_multiply_accumulate, complex_size, sum, FFTConvolver, Fft}; +pub use rust_fft::{ + complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, + TwoStageFFTConvolver, +}; #[cfg(feature = "ipp")] -pub use self::ipp_fft::FFT; +pub use self::ipp_fft::{ + complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, + TwoStageFFTConvolver, +}; From 034468e6b334a91a8112ae504bd596a98eb2142c Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 6 May 2025 12:11:38 +0200 Subject: [PATCH 03/11] Fix and add tests for validity --- src/fft_convolver.rs | 110 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 
9 deletions(-) diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs index f83b2d7..b766d1f 100644 --- a/src/fft_convolver.rs +++ b/src/fft_convolver.rs @@ -738,26 +738,30 @@ mod ipp_fft { } } - pub fn complex_multiply_accumulate(result: &mut [Complex32], a: &[Complex32], b: &[Complex32]) { + pub fn complex_multiply_accumulate( + result: &mut [Complex32], + a: &[Complex32], + b: &[Complex32], + temp_buffer: &mut [Complex32], + ) { assert_eq!(result.len(), a.len()); assert_eq!(result.len(), b.len()); + assert_eq!(temp_buffer.len(), a.len()); unsafe { - // IPP doesn't have a direct complex multiply-accumulate, so we do it in two steps - // First, multiply the complex arrays + // Use pre-allocated temp buffer instead of allocating let len = result.len(); - let mut temp = vec![Complex32::new(0.0, 0.0); len]; - ippsMulC_32fc( + // Use ippsMul_32fc instead of ippsMulC_32fc to correctly multiply arrays + ippsMul_32fc( a.as_ptr() as *const ipp_sys::Ipp32fc, - *(b.as_ptr() as *const ipp_sys::Ipp32fc), - temp.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + b.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, len as i32, ); - // Then add to the result ippsAdd_32fc( - temp.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, result.as_ptr() as *const ipp_sys::Ipp32fc, result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, len as i32, @@ -778,7 +782,19 @@ mod ipp_fft { ); } } + #[test] + fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } + } #[derive(Default, Clone)] pub struct FFTConvolver { ir_len: usize, @@ -797,6 +813,7 @@ mod ipp_fft { current: usize, input_buffer: Vec, input_buffer_fill: usize, + temp_buffer: Vec, } impl Convolution for FFTConvolver { @@ -858,6 +875,7 @@ mod ipp_fft { // reset current position let current = 0; + let temp_buffer = vec![Complex32::new(0.0, 0.0); fft_complex_size]; Self { ir_len, @@ -876,6 +894,7 @@ mod ipp_fft { current, input_buffer, input_buffer_fill, + temp_buffer, } } @@ -988,6 +1007,7 @@ mod ipp_fft { &mut self.pre_multiplied, &self.segments_ir[index_ir], &self.segments[index_audio], + &mut self.temp_buffer, ); } } @@ -1005,6 +1025,7 @@ mod ipp_fft { &mut self.conv, &self.segments[self.current], &self.segments_ir[0], + &mut self.temp_buffer, ); // Backward FFT @@ -1265,6 +1286,77 @@ mod ipp_fft { } } +#[cfg(all(test, feature = "ipp"))] +mod tests { + use super::*; + use crate::{fft_convolver::rust_fft, Convolution}; + + // Helper function that runs both implementations and compares results + fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { + let max_len = impulse_response.len(); + + // Create both implementations + let mut rust_convolver = + rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); + let mut ipp_convolver = ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); + + // Prepare output buffers + let mut rust_output = vec![0.0; input.len()]; + let mut ipp_output = vec![0.0; input.len()]; + + // Process with both implementations + rust_convolver.process(input, &mut rust_output); + ipp_convolver.process(input, &mut ipp_output); + + // Compare results (accounting for floating-point precision differences) + for i in 0..input.len() { + 
assert!( + (rust_output[i] - ipp_output[i]).abs() < 1e-5, + "Outputs differ at position {}: rust={}, ipp={}", + i, + rust_output[i], + ipp_output[i] + ); + } + } + + #[test] + fn test_ipp_vs_rust_impulse() { + // Test with an impulse response + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_decay() { + // Test with a decaying impulse response + let mut response = vec![0.0; 1024]; + for i in 0..response.len() { + response[i] = 0.9f32.powi(i as i32); + } + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_sine() { + // Test with a sine wave input + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + + let mut input = vec![0.0; 1024]; + for i in 0..input.len() { + input[i] = (i as f32 * 0.1).sin(); + } + + compare_implementations(&response, &input, 128); + } +} + #[cfg(not(feature = "ipp"))] pub use rust_fft::{ complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, From 2c9ef4a56848506bf65ddbabbdded14ca0c44d16 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 6 May 2025 14:55:10 +0200 Subject: [PATCH 04/11] Extract to separate files --- src/fft_convolver.rs | 1370 --------------------------------- src/fft_convolver/ipp_fft.rs | 661 ++++++++++++++++ src/fft_convolver/mod.rs | 86 +++ src/fft_convolver/rust_fft.rs | 493 ++++++++++++ src/tests.rs | 14 + 5 files changed, 1254 insertions(+), 1370 deletions(-) delete mode 100644 src/fft_convolver.rs create mode 100644 src/fft_convolver/ipp_fft.rs create mode 100644 src/fft_convolver/mod.rs create mode 100644 src/fft_convolver/rust_fft.rs diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs deleted file mode 100644 index b766d1f..0000000 --- a/src/fft_convolver.rs +++ /dev/null @@ -1,1370 +0,0 @@ -mod rust_fft { - use crate::{Convolution, Sample}; - use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; - use rustfft::num_complex::Complex; - use std::sync::Arc; - #[derive(Clone)] - pub struct Fft { - fft_forward: Arc>, - fft_inverse: Arc>, - } - - impl Default for Fft { - fn default() -> Self { - let mut planner = RealFftPlanner::::new(); - Self { - fft_forward: planner.plan_fft_forward(0), - fft_inverse: planner.plan_fft_inverse(0), - } - } - } - - impl std::fmt::Debug for Fft { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "") - } - } - - impl Fft { - pub fn init(&mut self, length: usize) { - let mut planner = RealFftPlanner::::new(); - self.fft_forward = planner.plan_fft_forward(length); - self.fft_inverse = planner.plan_fft_inverse(length); - } - - pub fn forward( - &self, - input: &mut [f32], - output: &mut [Complex], - ) -> Result<(), FftError> { - self.fft_forward.process(input, output)?; - Ok(()) - } - - pub fn inverse( - &self, - input: &mut [Complex], - output: &mut [f32], - ) -> Result<(), FftError> { - self.fft_inverse.process(input, output)?; - - // FFT Normalization - let len = output.len(); - output.iter_mut().for_each(|bin| *bin /= len as f32); - - Ok(()) - } - } - #[derive(Default, Clone)] - pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>>, - segments_ir: Vec>>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec>, - conv: Vec>, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, - } - 
- impl Convolution for FFTConvolver { - fn init( - impulse_response: &[Sample], - block_size: usize, - max_response_length: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; - let conv = vec![Complex::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - self.fft_buffer.fill(0.); - self.conv.fill(Complex::new(0., 0.)); - self.pre_multiplied.fill(Complex::new(0., 0.)); - self.overlap.fill(0.); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - self.segments_ir[i].fill(Complex::new(0., 0.)); - } - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - output.fill(0.); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - let input_buffer_pos = self.input_buffer_fill; - self.input_buffer[input_buffer_pos..input_buffer_pos + processing] - .clone_from_slice(&input[processed..processed + processing]); - - // Forward FFT - copy_and_pad(&mut 
self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_err) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - output.fill(0.); - return; // error! - } - - // complex multiplication - if input_buffer_was_empty { - self.pre_multiplied.fill(Complex { re: 0., im: 0. }); - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - ); - } - } - self.conv.clone_from_slice(&self.pre_multiplied); - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - ); - - // Backward FFT - if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - output.fill(0.); - return; // error! - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - self.input_buffer.fill(0.); - self.input_buffer_fill = 0; - // Save the overlap - self.overlap - .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } - } - pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 - } - - pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size); - dst[0..src_size].clone_from_slice(&src[0..src_size]); - dst[src_size..].iter_mut().for_each(|value| *value = 0.); - } - - pub fn complex_multiply_accumulate( - result: &mut [Complex], - a: &[Complex], - b: &[Complex], - ) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 4 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; - result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; - result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; - result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; - result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; - result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; - result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; - result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; - } - for i in end4..len { - result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; - result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; - } - } - - pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0] = a[i + 0] + b[i + 0]; - result[i + 1] = a[i + 1] + b[i + 1]; - result[i + 2] = a[i + 2] + b[i + 2]; - result[i + 3] = a[i + 3] + b[i + 3]; - } - for i in end4..len { - result[i] = a[i] + b[i]; - } - } - #[test] - fn test_fft_convolver_passthrough() { - let mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, 
response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); - } - } - #[derive(Clone)] - pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - } - - const HEAD_BLOCK_SIZE: usize = 128; - const TAIL_BLOCK_SIZE: usize = 1024; - - impl Convolution for TwoStageFFTConvolver { - fn init( - impulse_response: &[Sample], - _block_size: usize, - max_response_length: usize, - ) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - } - } - - fn update(&mut self, _response: &[Sample]) { - todo!() - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), - ); - - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; - - // Sum: 1st tail block - if self.tail_precalculated0.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated0[precalculated_pos]; - precalculated_pos += 1; - } - } - - // Sum: 2nd-Nth tail block - if self.tail_precalculated.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated[precalculated_pos]; - precalculated_pos += 1; - } - } - - 
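                // Two-stage scheme: the head convolver (HEAD_BLOCK_SIZE
                // partitions) covers the first TAIL_BLOCK_SIZE samples of the
                // IR with low latency, while the tail convolvers cover the
                // remainder. Tail output is computed one TAIL_BLOCK_SIZE block
                // ahead, swapped into `tail_precalculated0` /
                // `tail_precalculated`, and mixed in above at the running
                // offset `precalculated_pos`.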
self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] - .copy_from_slice(&input[processed..processed + processing]); - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 - { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], - ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if self.tail_precalculated.len() > 0 - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == TAIL_BLOCK_SIZE { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } - } -} - -#[cfg(feature = "ipp")] -mod ipp_fft { - use crate::{Convolution, Sample}; - use ipp_sys::*; - use num_complex::Complex32; - use std::mem; - use std::ptr; - use std::slice; - - pub struct Fft { - size: usize, - spec: Vec, - scratch_buffer: Vec, - } - - impl Default for Fft { - fn default() -> Self { - Self { - size: 0, - spec: Vec::new(), - scratch_buffer: Vec::new(), - } - } - } - - impl std::fmt::Debug for Fft { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "IppFft(size: {})", self.size) - } - } - - impl Clone for Fft { - fn clone(&self) -> Self { - let mut new_fft = Fft::default(); - if self.size > 0 { - new_fft.init(self.size); - } - new_fft - } - } - - impl Fft { - pub fn new(size: usize) -> Self { - let mut fft = Self::default(); - fft.init(size); - fft - } - - pub fn init(&mut self, size: usize) { - // Clean up old resources if any - self.clean_up(); - - if size == 0 { - return; - } - - assert!(size.is_power_of_two(), "FFT size must be a power of 2"); - - let mut spec_size = 0; - let mut init_size = 0; - let mut work_buf_size = 0; - - unsafe { - ippsDFTGetSize_R_32f( - size as i32, - 8, // IPP_FFT_NODIV_BY_ANY - 0, // No special hint - &mut spec_size, - &mut init_size, - &mut work_buf_size, - ); - } - - // Allocate aligned memory for buffers to get best SIMD performance - let mut init_buffer = allocate_aligned_buffer(init_size as usize); - let scratch_buffer = allocate_aligned_buffer(work_buf_size as usize); - let mut spec = allocate_aligned_buffer(spec_size as usize); - - unsafe { - ippsDFTInit_R_32f( - size as i32, - 8, // IPP_FFT_NODIV_BY_ANY - 0, // No special hint - spec.as_mut_ptr() as *mut ipp_sys::IppsDFTSpec_R_32f, - init_buffer.as_mut_ptr(), - ); - } - - // Free the init buffer as it's no longer needed - unsafe { - if !init_buffer.is_empty() { - let ptr = init_buffer.as_mut_ptr(); - mem::forget(mem::replace(&mut init_buffer, Vec::new())); - ippsFree(ptr as *mut _); - } - } - - self.size = size; - self.spec = spec; - self.scratch_buffer = scratch_buffer; - } - - pub fn forward( - &mut self, - input: &mut [f32], - output: &mut [Complex32], - ) -> Result<(), &'static str> { - if self.size == 0 { - return Err("FFT not initialized"); - } - - 
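            // ippsDFTFwd_RToCCS_32f writes its result in IPP's packed CCS
            // layout: size + 2 floats, i.e. size / 2 + 1 complex bins, which
            // is why the `Complex32` buffers are sized with `complex_size`.
            // With IPP_FFT_NODIV_BY_ANY (flag 8) neither direction scales the
            // data, so `inverse` divides by `size` explicitly.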
assert_eq!(input.len(), self.size, "Input length must match FFT size"); - assert_eq!( - output.len(), - self.size / 2 + 1, - "Output length must be size/2 + 1" - ); - - unsafe { - ippsDFTFwd_RToCCS_32f( - input.as_ptr(), - output.as_mut_ptr() as *mut _, - self.spec.as_ptr() as *const DFTSpec_R_32f, - self.scratch_buffer.as_mut_ptr(), - ); - } - - Ok(()) - } - - pub fn inverse( - &mut self, - input: &mut [Complex32], - output: &mut [f32], - ) -> Result<(), &'static str> { - if self.size == 0 { - return Err("FFT not initialized"); - } - - assert_eq!( - input.len(), - self.size / 2 + 1, - "Input length must be size/2 + 1" - ); - assert_eq!(output.len(), self.size, "Output length must match FFT size"); - - unsafe { - ippsDFTInv_CCSToR_32f( - input.as_ptr() as *const _, - output.as_mut_ptr(), - self.spec.as_ptr() as *const DFTSpec_R_32f, - self.scratch_buffer.as_mut_ptr(), - ); - } - - // Normalize the result - unsafe { - ippsDivC_32f_I(self.size as f32, output.as_mut_ptr(), self.size as i32); - } - - Ok(()) - } - - fn clean_up(&mut self) { - unsafe { - if !self.spec.is_empty() { - let ptr = self.spec.as_mut_ptr(); - mem::forget(mem::replace(&mut self.spec, Vec::new())); - ippsFree(ptr as *mut _); - } - - if !self.scratch_buffer.is_empty() { - let ptr = self.scratch_buffer.as_mut_ptr(); - mem::forget(mem::replace(&mut self.scratch_buffer, Vec::new())); - ippsFree(ptr as *mut _); - } - } - self.size = 0; - } - } - - // Drop trait implementation to free aligned memory - impl Drop for Fft { - fn drop(&mut self) { - self.clean_up(); - } - } - - // Helper function to allocate aligned memory using IPP functions - fn allocate_aligned_buffer(size: usize) -> Vec { - if size == 0 { - return Vec::new(); - } - - unsafe { - let ptr = ippsMalloc_8u(size as i32); - if ptr.is_null() { - panic!("Failed to allocate aligned memory"); - } - - // Construct a vector from the IPP-aligned pointer - Vec::from_raw_parts(ptr, size, size) - } - } - - // Helper functions for FFT convolution - pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 - } - - pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size, "Destination buffer too small"); - - // Copy the source data - unsafe { - ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); - - // Zero-pad the rest - if dst.len() > src_size { - ippsZero_32f( - dst.as_mut_ptr().add(src_size), - (dst.len() - src_size) as i32, - ); - } - } - } - - pub fn complex_multiply_accumulate( - result: &mut [Complex32], - a: &[Complex32], - b: &[Complex32], - temp_buffer: &mut [Complex32], - ) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - assert_eq!(temp_buffer.len(), a.len()); - - unsafe { - // Use pre-allocated temp buffer instead of allocating - let len = result.len(); - - // Use ippsMul_32fc instead of ippsMulC_32fc to correctly multiply arrays - ippsMul_32fc( - a.as_ptr() as *const ipp_sys::Ipp32fc, - b.as_ptr() as *const ipp_sys::Ipp32fc, - temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - len as i32, - ); - - ippsAdd_32fc( - temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, - result.as_ptr() as *const ipp_sys::Ipp32fc, - result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - len as i32, - ); - } - } - - pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - - unsafe { - ippsAdd_32f( - a.as_ptr(), - b.as_ptr(), - result.as_mut_ptr(), - result.len() as i32, - ); - } - } - #[test] - fn test_fft_convolver_passthrough() { - let 
mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); - } - } - #[derive(Default, Clone)] - pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - seg_size: usize, - seg_count: usize, - active_seg_count: usize, - fft_complex_size: usize, - segments: Vec>, - segments_ir: Vec>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec, - conv: Vec, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, - temp_buffer: Vec, - } - - impl Convolution for FFTConvolver { - fn init( - impulse_response: &[Sample], - block_size: usize, - max_response_length: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let mut segments = Vec::new(); - for _ in 0..seg_count { - segments.push(vec![Complex32::new(0., 0.); fft_complex_size]); - } - - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex32::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex32::new(0., 0.); fft_complex_size]; - let conv = vec![Complex32::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - let temp_buffer = vec![Complex32::new(0.0, 0.0); fft_complex_size]; - - Self { - ir_len, - block_size, - seg_size, - seg_count, - active_seg_count, - fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - temp_buffer, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - unsafe { - // Zero out buffers - ippsZero_32f(self.fft_buffer.as_mut_ptr(), self.fft_buffer.len() as i32); - ippsZero_32fc( - self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.conv.len() as i32, - ); - ippsZero_32fc( - self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.pre_multiplied.len() as i32, - ); - ippsZero_32f(self.overlap.as_mut_ptr(), self.overlap.len() as i32); - } - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let 
segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - unsafe { - ippsZero_32fc( - self.segments_ir[i].as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.segments_ir[i].len() as i32, - ); - } - } - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - // Copy input to input buffer - let input_buffer_pos = self.input_buffer_fill; - unsafe { - ippsCopy_32f( - &input[processed], - &mut self.input_buffer[input_buffer_pos], - processing as i32, - ); - } - - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - // Complex multiplication - if input_buffer_was_empty { - unsafe { - ippsZero_32fc( - self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.pre_multiplied.len() as i32, - ); - } - - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - &mut self.temp_buffer, - ); - } - } - - // Copy pre-multiplied to conv - unsafe { - ippsCopy_32fc( - self.pre_multiplied.as_ptr() as *const ipp_sys::Ipp32fc, - self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.conv.len() as i32, - ); - } - - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - &mut self.temp_buffer, - ); - - // Backward FFT - if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - unsafe { - ippsZero_32f( - self.input_buffer.as_mut_ptr(), - self.input_buffer.len() as i32, - ); - } - self.input_buffer_fill = 0; - - // Save the overlap - unsafe { - ippsCopy_32f( - &self.fft_buffer[self.block_size], - self.overlap.as_mut_ptr(), - self.block_size as i32, - ); - } - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } - } - - #[derive(Clone)] - pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, 
- tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - } - - const HEAD_BLOCK_SIZE: usize = 128; - const TAIL_BLOCK_SIZE: usize = 1024; - - impl Convolution for TwoStageFFTConvolver { - fn init( - impulse_response: &[Sample], - _block_size: usize, - max_response_length: usize, - ) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - } - } - - fn update(&mut self, _response: &[Sample]) { - todo!() - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), - ); - - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; - - // Sum: 1st tail block - if !self.tail_precalculated0.is_empty() { - let mut precalculated_pos = self.precalculated_pos; - - // Use IPP to add the tail block to output - unsafe { - ippsAdd_32f( - &self.tail_precalculated0[precalculated_pos], - &output[sum_begin], - &mut output[sum_begin], - processing as i32, - ); - // ippsAddC_32f_I( - // std::slice::from_raw_parts( - // self.tail_precalculated0.as_ptr().add(precalculated_pos), - // processing, - // ) - // .as_ptr(), - // output[sum_begin..sum_end].as_mut_ptr(), - // processing as i32, - // ); - } - } - - // Sum: 2nd-Nth tail block - if !self.tail_precalculated.is_empty() { - let precalculated_pos = self.precalculated_pos; - - // Use IPP to add the tail block to output - unsafe { - // ippsAddC_32f_I( - // std::slice::from_raw_parts( - // self.tail_precalculated.as_ptr().add(precalculated_pos), - // processing, - // ) - // .as_ptr(), - // 
output[sum_begin..sum_end].as_mut_ptr(), - // processing as i32, - // ); - ippsAdd_32f( - &self.tail_precalculated[precalculated_pos], - &output[sum_begin], - &mut output[sum_begin], - processing as i32, - ); - } - } - - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - unsafe { - ippsCopy_32f( - &input[processed], - &mut self.tail_input[self.tail_input_fill], - processing as i32, - ); - } - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if !self.tail_precalculated0.is_empty() - && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 - { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], - ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if !self.tail_precalculated.is_empty() - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == TAIL_BLOCK_SIZE { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } - } -} - -#[cfg(all(test, feature = "ipp"))] -mod tests { - use super::*; - use crate::{fft_convolver::rust_fft, Convolution}; - - // Helper function that runs both implementations and compares results - fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { - let max_len = impulse_response.len(); - - // Create both implementations - let mut rust_convolver = - rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); - let mut ipp_convolver = ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); - - // Prepare output buffers - let mut rust_output = vec![0.0; input.len()]; - let mut ipp_output = vec![0.0; input.len()]; - - // Process with both implementations - rust_convolver.process(input, &mut rust_output); - ipp_convolver.process(input, &mut ipp_output); - - // Compare results (accounting for floating-point precision differences) - for i in 0..input.len() { - assert!( - (rust_output[i] - ipp_output[i]).abs() < 1e-5, - "Outputs differ at position {}: rust={}, ipp={}", - i, - rust_output[i], - ipp_output[i] - ); - } - } - - #[test] - fn test_ipp_vs_rust_impulse() { - // Test with an impulse response - let mut response = vec![0.0; 1024]; - response[0] = 1.0; - let input = vec![1.0; 1024]; - - compare_implementations(&response, &input, 256); - } - - #[test] - fn test_ipp_vs_rust_decay() { - // Test with a decaying impulse response - let mut response = vec![0.0; 1024]; - for i in 0..response.len() { - response[i] = 0.9f32.powi(i as i32); - } - let input = vec![1.0; 1024]; - - compare_implementations(&response, &input, 256); - } - - #[test] - fn test_ipp_vs_rust_sine() { - // Test with a sine wave input - let mut response = vec![0.0; 1024]; - response[0] = 1.0; - - let mut input = vec![0.0; 1024]; - for i in 0..input.len() { - input[i] = (i as f32 * 0.1).sin(); - } - - compare_implementations(&response, &input, 128); - } -} - -#[cfg(not(feature = "ipp"))] -pub use rust_fft::{ - complex_multiply_accumulate, complex_size, copy_and_pad, 
sum, FFTConvolver, Fft, - TwoStageFFTConvolver, -}; - -#[cfg(feature = "ipp")] -pub use self::ipp_fft::{ - complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, - TwoStageFFTConvolver, -}; diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs new file mode 100644 index 0000000..c3ec1f6 --- /dev/null +++ b/src/fft_convolver/ipp_fft.rs @@ -0,0 +1,661 @@ +use crate::{Convolution, Sample}; +use ipp_sys::*; +use num_complex::Complex32; + +pub struct Fft { + size: usize, + spec: Vec, + scratch_buffer: Vec, +} + +impl Default for Fft { + fn default() -> Self { + Self { + size: 0, + spec: Vec::new(), + scratch_buffer: Vec::new(), + } + } +} + +impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "IppFft(size: {})", self.size) + } +} + +impl Clone for Fft { + fn clone(&self) -> Self { + let mut new_fft = Fft::default(); + if self.size > 0 { + new_fft.init(self.size); + } + new_fft + } +} + +impl Fft { + pub fn new(size: usize) -> Self { + let mut fft = Self::default(); + fft.init(size); + fft + } + + pub fn init(&mut self, size: usize) { + assert!(size > 0, "FFT size must be greater than 0"); + assert!(size.is_power_of_two(), "FFT size must be a power of 2"); + + let mut spec_size = 0; + let mut init_size = 0; + let mut work_buf_size = 0; + + unsafe { + ippsDFTGetSize_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + &mut spec_size, + &mut init_size, + &mut work_buf_size, + ); + } + + let mut init_buffer = vec![0; init_size as usize]; + let scratch_buffer = vec![0; work_buf_size as usize]; + let mut spec = vec![0; spec_size as usize]; + + unsafe { + ippsDFTInit_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + spec.as_mut_ptr() as *mut ipp_sys::IppsDFTSpec_R_32f, + init_buffer.as_mut_ptr(), + ); + } + + self.size = size; + self.spec = spec; + self.scratch_buffer = scratch_buffer; + } + + pub fn forward( + &mut self, + input: &mut [f32], + output: &mut [Complex32], + ) -> Result<(), &'static str> { + if self.size == 0 { + return Err("FFT not initialized"); + } + + assert_eq!(input.len(), self.size, "Input length must match FFT size"); + assert_eq!( + output.len(), + self.size / 2 + 1, + "Output length must be size/2 + 1" + ); + + unsafe { + ippsDFTFwd_RToCCS_32f( + input.as_ptr(), + output.as_mut_ptr() as *mut Ipp32f, // the API takes the pointer to the first float member of the complex array + self.spec.as_ptr() as *const DFTSpec_R_32f, + self.scratch_buffer.as_mut_ptr(), + ); + } + + Ok(()) + } + + pub fn inverse( + &mut self, + input: &mut [Complex32], + output: &mut [f32], + ) -> Result<(), &'static str> { + if self.size == 0 { + return Err("FFT not initialized"); + } + + assert_eq!( + input.len(), + self.size / 2 + 1, + "Input length must be size/2 + 1" + ); + assert_eq!(output.len(), self.size, "Output length must match FFT size"); + + unsafe { + ippsDFTInv_CCSToR_32f( + input.as_ptr() as *const Ipp32f, // the API takes the pointer to the first float member of the complex array + output.as_mut_ptr(), + self.spec.as_ptr() as *const DFTSpec_R_32f, + self.scratch_buffer.as_mut_ptr(), + ); + } + + // Normalization + unsafe { + ippsDivC_32f_I(self.size as f32, output.as_mut_ptr(), self.size as i32); + } + + Ok(()) + } +} + +pub fn complex_size(size: usize) -> usize { + (size / 2) + 1 +} + +pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size, "Destination buffer too small"); + + unsafe { + 
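        // ippsCopy_32f / ippsZero_32f mirror the safe copy + zero-fill used by
        // the rustfft backend. The assert above bounds the zero-pad of `dst`;
        // the copy itself relies on callers passing at least `src_size`
        // samples in `src`.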
ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); + + if dst.len() > src_size { + ippsZero_32f( + dst.as_mut_ptr().add(src_size), + (dst.len() - src_size) as i32, + ); + } + } +} + +pub fn complex_multiply_accumulate( + result: &mut [Complex32], + a: &[Complex32], + b: &[Complex32], + temp_buffer: &mut [Complex32], +) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + assert_eq!(temp_buffer.len(), a.len()); + + unsafe { + let len = result.len() as i32; + + ippsMul_32fc( + a.as_ptr() as *const ipp_sys::Ipp32fc, + b.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, + ); + + ippsAdd_32fc( + temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, + ); + } +} + +pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + + unsafe { + ippsAdd_32f( + a.as_ptr(), + b.as_ptr(), + result.as_mut_ptr(), + result.len() as i32, + ); + } +} + +#[derive(Default, Clone)] +pub struct FFTConvolver { + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: Fft, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + temp_buffer: Vec, +} + +impl Convolution for FFTConvolver { + fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = complex_size(seg_size); + + // FFT + let mut fft = Fft::default(); + fft.init(seg_size); + let mut fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::new(); + for _ in 0..seg_count { + segments.push(vec![Complex32::new(0., 0.); fft_complex_size]); + } + + let mut segments_ir = Vec::new(); + + // prepare ir + for i in 0..seg_count { + let mut segment = vec![Complex32::new(0., 0.); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut fft_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![Complex32::new(0., 0.); fft_complex_size]; + let conv = vec![Complex32::new(0., 0.); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + let temp_buffer = vec![Complex32::new(0.0, 0.0); fft_complex_size]; + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + 
temp_buffer, + } + } + + fn update(&mut self, response: &[Sample]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + unsafe { + // Zero out buffers + ippsZero_32f(self.fft_buffer.as_mut_ptr(), self.fft_buffer.len() as i32); + ippsZero_32fc( + self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.conv.len() as i32, + ); + ippsZero_32fc( + self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.pre_multiplied.len() as i32, + ); + ippsZero_32f(self.overlap.as_mut_ptr(), self.overlap.len() as i32); + } + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + unsafe { + ippsZero_32fc( + self.segments_ir[i].as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.segments_ir[i].len() as i32, + ); + } + } + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + if self.active_seg_count == 0 { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input to input buffer + let input_buffer_pos = self.input_buffer_fill; + unsafe { + ippsCopy_32f( + &input[processed], + &mut self.input_buffer[input_buffer_pos], + processing as i32, + ); + } + + // Forward FFT + copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + // Complex multiplication + if input_buffer_was_empty { + unsafe { + ippsZero_32fc( + self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.pre_multiplied.len() as i32, + ); + } + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + &mut self.temp_buffer, + ); + } + } + + // Copy pre-multiplied to conv + unsafe { + ippsCopy_32fc( + self.pre_multiplied.as_ptr() as *const ipp_sys::Ipp32fc, + self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + self.conv.len() as i32, + ); + } + + complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + &mut self.temp_buffer, + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + unsafe { + ippsZero_32f(output.as_mut_ptr(), output.len() as i32); + } + return; + } + + // Add overlap + sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + 
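            // Once a full block has been gathered, the second half of the
            // inverse FFT is saved as the next block's overlap and `current`
            // steps backwards through the spectrum ring (the frequency-domain
            // delay line), exactly as in the rustfft backend.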
self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + unsafe { + ippsZero_32f( + self.input_buffer.as_mut_ptr(), + self.input_buffer.len() as i32, + ); + } + self.input_buffer_fill = 0; + + // Save the overlap + unsafe { + ippsCopy_32f( + &self.fft_buffer[self.block_size], + self.overlap.as_mut_ptr(), + self.block_size as i32, + ); + } + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} + +#[derive(Clone)] +pub struct TwoStageFFTConvolver { + head_convolver: FFTConvolver, + tail_convolver0: FFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: FFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, +} + +const HEAD_BLOCK_SIZE: usize = 128; +const TAIL_BLOCK_SIZE: usize = 1024; + +impl Convolution for TwoStageFFTConvolver { + fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { + let head_block_size = HEAD_BLOCK_SIZE; + let tail_block_size = TAIL_BLOCK_SIZE; + + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = FFTConvolver::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = (max_response_length > tail_block_size) + .then(|| { + let tail_ir_len = + std::cmp::min(max_response_length - tail_block_size, tail_block_size); + FFTConvolver::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = (max_response_length > 2 * tail_block_size) + .then(|| { + let tail_ir_len = max_response_length - 2 * tail_block_size; + FFTConvolver::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + TwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + } + } + + fn update(&mut self, _response: &[Sample]) { + todo!() + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + ); + + // Sum head and tail + let sum_begin = processed; + + // Sum: 1st tail block + if !self.tail_precalculated0.is_empty() { + // Use IPP to add the tail block to output + unsafe { + ippsAdd_32f( + 
&self.tail_precalculated0[self.precalculated_pos], + &output[sum_begin], + &mut output[sum_begin], + processing as i32, + ); + } + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + // Use IPP to add the tail block to output + unsafe { + ippsAdd_32f( + &self.tail_precalculated[self.precalculated_pos], + &output[sum_begin], + &mut output[sum_begin], + processing as i32, + ); + } + } + + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + unsafe { + ippsCopy_32f( + &input[processed], + &mut self.tail_input[self.tail_input_fill], + processing as i32, + ); + } + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { + assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); + let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], + &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + ); + if self.tail_input_fill == TAIL_BLOCK_SIZE { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == TAIL_BLOCK_SIZE + && self.tail_output.len() == TAIL_BLOCK_SIZE + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == TAIL_BLOCK_SIZE { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs new file mode 100644 index 0000000..6b507e7 --- /dev/null +++ b/src/fft_convolver/mod.rs @@ -0,0 +1,86 @@ +#[cfg(feature = "ipp")] +pub mod ipp_fft; +pub mod rust_fft; + +#[cfg(not(feature = "ipp"))] +pub use rust_fft::{ + complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, + TwoStageFFTConvolver, +}; + +#[cfg(feature = "ipp")] +pub use self::ipp_fft::{ + complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, + TwoStageFFTConvolver, +}; + +#[cfg(all(test, feature = "ipp"))] +mod tests { + use super::*; + use crate::{fft_convolver::rust_fft, Convolution}; + + // Helper function that runs both implementations and compares results + fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { + let max_len = impulse_response.len(); + + // Create both implementations + let mut rust_convolver = + rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); + let mut ipp_convolver = ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); + + // Prepare output buffers + let mut rust_output = vec![0.0; input.len()]; + let mut ipp_output = vec![0.0; input.len()]; + + // Process with both implementations + rust_convolver.process(input, &mut rust_output); + ipp_convolver.process(input, &mut ipp_output); + + // Compare results (accounting for floating-point precision differences) + for i in 0..input.len() { + assert!( + (rust_output[i] - ipp_output[i]).abs() < 1e-5, + "Outputs differ at position {}: rust={}, ipp={}", + i, + rust_output[i], + ipp_output[i] + ); + } + } + + #[test] + fn test_ipp_vs_rust_impulse() { + // Test with an impulse response + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + let input = vec![1.0; 1024]; + + 
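        // With a unit impulse the convolution is the identity, so both
        // backends should reproduce `input`. A direct-form reference
        // (hypothetical helper, shown only to make the expectation explicit):
        fn direct_conv(h: &[f32], x: &[f32]) -> Vec<f32> {
            let mut y = vec![0.0f32; x.len()];
            for n in 0..x.len() {
                // y[n] = sum over k of h[k] * x[n - k]
                for k in 0..=n.min(h.len() - 1) {
                    y[n] += h[k] * x[n - k];
                }
            }
            y
        }
        let expected = direct_conv(&response, &input);
        assert!(expected
            .iter()
            .zip(&input)
            .all(|(e, x)| (e - x).abs() < 1e-6));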
compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_decay() { + // Test with a decaying impulse response + let mut response = vec![0.0; 1024]; + for i in 0..response.len() { + response[i] = 0.9f32.powi(i as i32); + } + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_sine() { + // Test with a sine wave input + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + + let mut input = vec![0.0; 1024]; + for i in 0..input.len() { + input[i] = (i as f32 * 0.1).sin(); + } + + compare_implementations(&response, &input, 128); + } +} diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs new file mode 100644 index 0000000..fbc7935 --- /dev/null +++ b/src/fft_convolver/rust_fft.rs @@ -0,0 +1,493 @@ +use crate::{Convolution, Sample}; +use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; +use rustfft::num_complex::Complex; +use std::sync::Arc; +#[derive(Clone)] +pub struct Fft { + fft_forward: Arc>, + fft_inverse: Arc>, +} + +impl Default for Fft { + fn default() -> Self { + let mut planner = RealFftPlanner::::new(); + Self { + fft_forward: planner.plan_fft_forward(0), + fft_inverse: planner.plan_fft_inverse(0), + } + } +} + +impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "") + } +} + +impl Fft { + pub fn init(&mut self, length: usize) { + let mut planner = RealFftPlanner::::new(); + self.fft_forward = planner.plan_fft_forward(length); + self.fft_inverse = planner.plan_fft_inverse(length); + } + + pub fn forward(&self, input: &mut [f32], output: &mut [Complex]) -> Result<(), FftError> { + self.fft_forward.process(input, output)?; + Ok(()) + } + + pub fn inverse(&self, input: &mut [Complex], output: &mut [f32]) -> Result<(), FftError> { + self.fft_inverse.process(input, output)?; + + // FFT Normalization + let len = output.len(); + output.iter_mut().for_each(|bin| *bin /= len as f32); + + Ok(()) + } +} +#[derive(Default, Clone)] +pub struct FFTConvolver { + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>>, + segments_ir: Vec>>, + fft_buffer: Vec, + fft: Fft, + pre_multiplied: Vec>, + conv: Vec>, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, +} + +impl Convolution for FFTConvolver { + fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = complex_size(seg_size); + + // FFT + let mut fft = Fft::default(); + fft.init(seg_size); + let mut fft_buffer = vec![0.; seg_size]; + + // prepare segments + let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; + let mut segments_ir = Vec::new(); + + // prepare ir + for i in 0..seg_count { + let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + 
remaining + }; + copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut fft_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; + let conv = vec![Complex::new(0., 0.); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + } + } + + fn update(&mut self, response: &[Sample]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + self.fft_buffer.fill(0.); + self.conv.fill(Complex::new(0., 0.)); + self.pre_multiplied.fill(Complex::new(0., 0.)); + self.overlap.fill(0.); + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + self.segments_ir[i].fill(Complex::new(0., 0.)); + } + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + if self.active_seg_count == 0 { + output.fill(0.); + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + let input_buffer_pos = self.input_buffer_fill; + self.input_buffer[input_buffer_pos..input_buffer_pos + processing] + .clone_from_slice(&input[processed..processed + processing]); + + // Forward FFT + copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_err) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + output.fill(0.); + return; // error! + } + + // complex multiplication + if input_buffer_was_empty { + self.pre_multiplied.fill(Complex { re: 0., im: 0. }); + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + ); + } + } + self.conv.clone_from_slice(&self.pre_multiplied); + complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + ); + + // Backward FFT + if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + output.fill(0.); + return; // error! 
+ } + + // Add overlap + sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + self.input_buffer.fill(0.); + self.input_buffer_fill = 0; + // Save the overlap + self.overlap + .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} +pub fn complex_size(size: usize) -> usize { + (size / 2) + 1 +} + +pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size); + dst[0..src_size].clone_from_slice(&src[0..src_size]); + dst[src_size..].iter_mut().for_each(|value| *value = 0.); +} + +pub fn complex_multiply_accumulate( + result: &mut [Complex], + a: &[Complex], + b: &[Complex], +) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 4 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; + result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; + result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; + result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; + result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; + result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; + result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; + result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + } + for i in end4..len { + result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; + result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + } +} + +pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0] = a[i + 0] + b[i + 0]; + result[i + 1] = a[i + 1] + b[i + 1]; + result[i + 2] = a[i + 2] + b[i + 2]; + result[i + 3] = a[i + 3] + b[i + 3]; + } + for i in end4..len { + result[i] = a[i] + b[i]; + } +} +#[test] +fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } +} +#[derive(Clone)] +pub struct TwoStageFFTConvolver { + head_convolver: FFTConvolver, + tail_convolver0: FFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: FFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, +} + +const HEAD_BLOCK_SIZE: usize = 128; +const TAIL_BLOCK_SIZE: usize = 1024; + +impl Convolution for TwoStageFFTConvolver { + fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { + let head_block_size = HEAD_BLOCK_SIZE; + let tail_block_size = TAIL_BLOCK_SIZE; + + if max_response_length < impulse_response.len() { + panic!( + 
"max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = FFTConvolver::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = (max_response_length > tail_block_size) + .then(|| { + let tail_ir_len = + std::cmp::min(max_response_length - tail_block_size, tail_block_size); + FFTConvolver::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = (max_response_length > 2 * tail_block_size) + .then(|| { + let tail_ir_len = max_response_length - 2 * tail_block_size; + FFTConvolver::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + }) + .unwrap_or_default(); + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + TwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + } + } + + fn update(&mut self, _response: &[Sample]) { + todo!() + } + + fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + ); + + // Sum head and tail + let sum_begin = processed; + let sum_end = processed + processing; + + // Sum: 1st tail block + if self.tail_precalculated0.len() > 0 { + let mut precalculated_pos = self.precalculated_pos; + for i in sum_begin..sum_end { + output[i] += self.tail_precalculated0[precalculated_pos]; + precalculated_pos += 1; + } + } + + // Sum: 2nd-Nth tail block + if self.tail_precalculated.len() > 0 { + let mut precalculated_pos = self.precalculated_pos; + for i in sum_begin..sum_end { + output[i] += self.tail_precalculated[precalculated_pos]; + precalculated_pos += 1; + } + } + + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] + .copy_from_slice(&input[processed..processed + processing]); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { + assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); + let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], + &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + ); + if self.tail_input_fill == TAIL_BLOCK_SIZE { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if self.tail_precalculated.len() > 0 + 
&& self.tail_input_fill == TAIL_BLOCK_SIZE + && self.tail_output.len() == TAIL_BLOCK_SIZE + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == TAIL_BLOCK_SIZE { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} diff --git a/src/tests.rs b/src/tests.rs index 160aac4..7b2be9a 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -4,6 +4,20 @@ mod tests { use crate::fft_convolver::FFTConvolver; use crate::{Convolution, Sample}; + #[test] + fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } + } + fn generate_sinusoid( length: usize, frequency: f32, From b68c55faef30fe8fff791077f45b010cf153d273 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Wed, 7 May 2025 10:02:46 +0200 Subject: [PATCH 05/11] Add ipp information to readme --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2064f04..7594ba6 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ As the original C++ implementation, this library implements: - Partitioned convolution algorithm (using uniform block sizes) - Optional support for non-uniform block sizes (`TwoStageFFTConvolver`) +- An optimised version of the convolvers using Intel IPP (if the `ipp` feature is enabled) On top of that it implements: @@ -16,9 +17,19 @@ On top of that it implements: Compared to the original C++ implementation, this implementation does _not_ provide: -- Its own FFT implementation (it currently uses the rustfft crate) +- Its own FFT implementation (it currently uses the rustfft crate by default or the IPP library if the `ipp` feature is enabled) - The option to use SSE instructions in the `FFTConvolver` ## Prerequisites: - rust >=1.72.0 + +#### Optional dependencies: + +- Intel IPP (if the `ipp` feature is enabled) + +When building with the `ipp` feature, the `ipp-sys` crate is used to link against the IPP library. To build with the `ipp` feature (`cargo build --features ipp`), you need to have the IPP library installed and the `IPPROOT` environment variable set to the root directory of the IPP installation. On Linux this is achieved by running: + +- `source /opt/intel/oneapi/setvars.sh` + +If you want `ipp` to be statically linked, export `IPP_STATIC=1` before running `cargo build`. 
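
For example, a build with statically linked IPP could look like this (assuming a default oneAPI installation; adjust the path to your setup):

- `source /opt/intel/oneapi/setvars.sh`
- `IPP_STATIC=1 cargo build --features ipp`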
\ No newline at end of file From 39ba1770b49e94b60a4449e93725446f0315c9cb Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Fri, 23 May 2025 17:26:02 +0200 Subject: [PATCH 06/11] Expose head and tail block size for two stage convolvers --- src/fft_convolver/ipp_fft.rs | 47 ++++++++++++++++++++--------------- src/fft_convolver/rust_fft.rs | 47 ++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 40 deletions(-) diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs index c3ec1f6..327d487 100644 --- a/src/fft_convolver/ipp_fft.rs +++ b/src/fft_convolver/ipp_fft.rs @@ -492,16 +492,18 @@ pub struct TwoStageFFTConvolver { tail_input: Vec, tail_input_fill: usize, precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, } -const HEAD_BLOCK_SIZE: usize = 128; -const TAIL_BLOCK_SIZE: usize = 1024; - -impl Convolution for TwoStageFFTConvolver { - fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - +impl TwoStageFFTConvolver { + pub fn init( + impulse_response: &[Sample], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { if max_response_length < impulse_response.len() { panic!( "max_response_length must be at least the length of the initial impulse response" @@ -560,14 +562,19 @@ impl Convolution for TwoStageFFTConvolver { tail_input, tail_input_fill, precalculated_pos, + head_block_size, + tail_block_size, } } - fn update(&mut self, _response: &[Sample]) { + pub fn update(&mut self, _response: &[Sample]) { todo!() } - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + pub fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + // Head self.head_convolver.process(input, output); @@ -583,7 +590,7 @@ impl Convolution for TwoStageFFTConvolver { let remaining = len - processed; let processing = std::cmp::min( remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + head_block_size - (self.tail_input_fill % head_block_size), ); // Sum head and tail @@ -628,29 +635,29 @@ impl Convolution for TwoStageFFTConvolver { self.tail_input_fill += processing; // Convolution: 1st tail block - if !self.tail_precalculated0.is_empty() && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { + if self.tail_input_fill == tail_block_size { std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); } } // Convolution: 2nd-Nth tail block (might be done in some background thread) if !self.tail_precalculated.is_empty() - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE + && self.tail_input_fill == tail_block_size + && self.tail_output.len() 
== tail_block_size { std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); self.tail_convolver .process(&self.tail_input, &mut self.tail_output); } - if self.tail_input_fill == TAIL_BLOCK_SIZE { + if self.tail_input_fill == tail_block_size { self.tail_input_fill = 0; self.precalculated_pos = 0; } diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs index fbc7935..870c292 100644 --- a/src/fft_convolver/rust_fft.rs +++ b/src/fft_convolver/rust_fft.rs @@ -336,16 +336,18 @@ pub struct TwoStageFFTConvolver { tail_input: Vec, tail_input_fill: usize, precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, } -const HEAD_BLOCK_SIZE: usize = 128; -const TAIL_BLOCK_SIZE: usize = 1024; - -impl Convolution for TwoStageFFTConvolver { - fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - +impl TwoStageFFTConvolver { + pub fn init( + impulse_response: &[Sample], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { if max_response_length < impulse_response.len() { panic!( "max_response_length must be at least the length of the initial impulse response" @@ -404,14 +406,19 @@ impl Convolution for TwoStageFFTConvolver { tail_input, tail_input_fill, precalculated_pos, + head_block_size, + tail_block_size, } } - fn update(&mut self, _response: &[Sample]) { + pub fn update(&mut self, _response: &[Sample]) { todo!() } - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + pub fn process(&mut self, input: &[Sample], output: &mut [Sample]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + // Head self.head_convolver.process(input, output); @@ -427,7 +434,7 @@ impl Convolution for TwoStageFFTConvolver { let remaining = len - processed; let processing = std::cmp::min( remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), + head_block_size - (self.tail_input_fill % head_block_size), ); // Sum head and tail @@ -460,29 +467,29 @@ impl Convolution for TwoStageFFTConvolver { self.tail_input_fill += processing; // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; + if self.tail_precalculated0.len() > 0 && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { + if self.tail_input_fill == tail_block_size { std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); } } // Convolution: 2nd-Nth tail block (might be done in some background thread) if self.tail_precalculated.len() > 0 - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE + && self.tail_input_fill == tail_block_size + && self.tail_output.len() == tail_block_size { std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); self.tail_convolver 
.process(&self.tail_input, &mut self.tail_output); } - if self.tail_input_fill == TAIL_BLOCK_SIZE { + if self.tail_input_fill == tail_block_size { self.tail_input_fill = 0; self.precalculated_pos = 0; } From 726963833915daf5694772edbcc4166c7aee6702 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Mon, 26 May 2025 17:37:14 +0200 Subject: [PATCH 07/11] Unify ipp and rustfft implementations with traits --- src/fft_convolver/ipp_fft.rs | 497 +-------------------------- src/fft_convolver/mod.rs | 609 +++++++++++++++++++++++++++++++++- src/fft_convolver/rust_fft.rs | 430 +++--------------------- src/tests.rs | 34 +- 4 files changed, 705 insertions(+), 865 deletions(-) diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs index 327d487..7af4bbf 100644 --- a/src/fft_convolver/ipp_fft.rs +++ b/src/fft_convolver/ipp_fft.rs @@ -1,4 +1,4 @@ -use crate::{Convolution, Sample}; +use crate::{fft_convolver::FftBackend, Convolution, Sample}; use ipp_sys::*; use num_complex::Complex32; @@ -34,14 +34,10 @@ impl Clone for Fft { } } -impl Fft { - pub fn new(size: usize) -> Self { - let mut fft = Self::default(); - fft.init(size); - fft - } +impl FftBackend for Fft { + type Complex = Complex32; - pub fn init(&mut self, size: usize) { + fn init(&mut self, size: usize) { assert!(size > 0, "FFT size must be greater than 0"); assert!(size.is_power_of_two(), "FFT size must be a power of 2"); @@ -79,13 +75,13 @@ impl Fft { self.scratch_buffer = scratch_buffer; } - pub fn forward( + fn forward( &mut self, input: &mut [f32], - output: &mut [Complex32], - ) -> Result<(), &'static str> { + output: &mut [Self::Complex], + ) -> Result<(), Box> { if self.size == 0 { - return Err("FFT not initialized"); + return Err("FFT not initialized".into()); } assert_eq!(input.len(), self.size, "Input length must match FFT size"); @@ -107,13 +103,13 @@ impl Fft { Ok(()) } - pub fn inverse( + fn inverse( &mut self, - input: &mut [Complex32], + input: &mut [Self::Complex], output: &mut [f32], - ) -> Result<(), &'static str> { + ) -> Result<(), Box> { if self.size == 0 { - return Err("FFT not initialized"); + return Err("FFT not initialized".into()); } assert_eq!( @@ -203,466 +199,9 @@ pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { } } -#[derive(Default, Clone)] -pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>, - segments_ir: Vec>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec, - conv: Vec, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, - temp_buffer: Vec, -} - -impl Convolution for FFTConvolver { - fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let mut segments = Vec::new(); - for _ in 0..seg_count { - segments.push(vec![Complex32::new(0., 0.); 
fft_complex_size]); - } - - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex32::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex32::new(0., 0.); fft_complex_size]; - let conv = vec![Complex32::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - let temp_buffer = vec![Complex32::new(0.0, 0.0); fft_complex_size]; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - temp_buffer, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - unsafe { - // Zero out buffers - ippsZero_32f(self.fft_buffer.as_mut_ptr(), self.fft_buffer.len() as i32); - ippsZero_32fc( - self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.conv.len() as i32, - ); - ippsZero_32fc( - self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.pre_multiplied.len() as i32, - ); - ippsZero_32f(self.overlap.as_mut_ptr(), self.overlap.len() as i32); - } - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - unsafe { - ippsZero_32fc( - self.segments_ir[i].as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.segments_ir[i].len() as i32, - ); - } - } - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - // Copy input to input buffer - let input_buffer_pos = self.input_buffer_fill; - unsafe { - ippsCopy_32f( - &input[processed], - &mut self.input_buffer[input_buffer_pos], - processing as i32, - ); - } - - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - // Complex multiplication - if input_buffer_was_empty { - unsafe { - ippsZero_32fc( - 
self.pre_multiplied.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.pre_multiplied.len() as i32, - ); - } - - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - &mut self.temp_buffer, - ); - } - } - - // Copy pre-multiplied to conv - unsafe { - ippsCopy_32fc( - self.pre_multiplied.as_ptr() as *const ipp_sys::Ipp32fc, - self.conv.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - self.conv.len() as i32, - ); - } - - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - &mut self.temp_buffer, - ); - - // Backward FFT - if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - unsafe { - ippsZero_32f(output.as_mut_ptr(), output.len() as i32); - } - return; - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - unsafe { - ippsZero_32f( - self.input_buffer.as_mut_ptr(), - self.input_buffer.len() as i32, - ); - } - self.input_buffer_fill = 0; - - // Save the overlap - unsafe { - ippsCopy_32f( - &self.fft_buffer[self.block_size], - self.overlap.as_mut_ptr(), - self.block_size as i32, - ); - } - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } -} - -#[derive(Clone)] -pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - head_block_size: usize, - tail_block_size: usize, -} - -impl TwoStageFFTConvolver { - pub fn init( - impulse_response: &[Sample], - _block_size: usize, - max_response_length: usize, - head_block_size: usize, - tail_block_size: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = 
vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - head_block_size, - tail_block_size, - } - } - - pub fn update(&mut self, _response: &[Sample]) { - todo!() - } - - pub fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - let head_block_size = self.head_block_size; - let tail_block_size = self.tail_block_size; - - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - head_block_size - (self.tail_input_fill % head_block_size), - ); - - // Sum head and tail - let sum_begin = processed; - - // Sum: 1st tail block - if !self.tail_precalculated0.is_empty() { - // Use IPP to add the tail block to output - unsafe { - ippsAdd_32f( - &self.tail_precalculated0[self.precalculated_pos], - &output[sum_begin], - &mut output[sum_begin], - processing as i32, - ); - } - } - - // Sum: 2nd-Nth tail block - if !self.tail_precalculated.is_empty() { - // Use IPP to add the tail block to output - unsafe { - ippsAdd_32f( - &self.tail_precalculated[self.precalculated_pos], - &output[sum_begin], - &mut output[sum_begin], - processing as i32, - ); - } - } - - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - unsafe { - ippsCopy_32f( - &input[processed], - &mut self.tail_input[self.tail_input_fill], - processing as i32, - ); - } - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { - assert!(self.tail_input_fill >= head_block_size); - let block_offset = self.tail_input_fill - head_block_size; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + head_block_size], - &mut self.tail_output0[block_offset..block_offset + head_block_size], - ); - if self.tail_input_fill == tail_block_size { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if !self.tail_precalculated.is_empty() - && self.tail_input_fill == tail_block_size - && self.tail_output.len() == tail_block_size - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == tail_block_size { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } -} +pub type FFTConvolver = + crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = crate::fft_convolver::GenericTwoStageFFTConvolver< + Fft, + crate::fft_convolver::ipp_ops::IppComplexOps, +>; diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs index 6b507e7..bfc6450 100644 --- a/src/fft_convolver/mod.rs +++ b/src/fft_convolver/mod.rs @@ -1,23 +1,628 @@ +use crate::Convolution; #[cfg(feature = "ipp")] pub mod ipp_fft; pub mod rust_fft; +// Define a trait that will be implemented by both FFT backends +pub trait FftBackend: Clone { + type Complex; + + fn init(&mut self, 
size: usize); + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box>; + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box>; +} + +// Define trait for complex operations that will be used by both backends +pub trait ComplexOps: Clone { + type Complex: Clone + Default; + + fn complex_size(size: usize) -> usize; + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize); + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ); + fn sum(result: &mut [f32], a: &[f32], b: &[f32]); + fn zero_complex(buffer: &mut [Self::Complex]); + fn zero_real(buffer: &mut [f32]); + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]); + fn add_to_buffer(dst: &mut [f32], src: &[f32]); +} + +// Generic FFTConvolver implementation that works with any backend +#[derive(Default, Clone)] +pub struct GenericFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: F, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + _complex_ops: std::marker::PhantomData, + temp_buffer: Option>, +} + +impl> GenericFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init_internal( + impulse_response: &[f32], + block_size: usize, + max_response_length: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = C::complex_size(seg_size); + + // FFT + let mut fft = F::default(); + fft.init(seg_size); + let fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::with_capacity(seg_count); + for _ in 0..seg_count { + segments.push(vec![F::Complex::default(); fft_complex_size]); + } + + let mut segments_ir = Vec::with_capacity(seg_count); + + // prepare ir + let mut ir_buffer = vec![0.0; seg_size]; + for i in 0..seg_count { + let mut segment = vec![F::Complex::default(); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + C::copy_and_pad(&mut ir_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut ir_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![F::Complex::default(); fft_complex_size]; + let conv = vec![F::Complex::default(); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + // For IPP backend we need a temp buffer + let temp_buffer = Some(vec![F::Complex::default(); fft_complex_size]); + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + 
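            // `seg_count` is fixed by `max_response_length`, while the
            // `active_seg_count` field that follows tracks how many IR segments
            // are currently in use; it may shrink when `update()` is called
            // with a shorter impulse response.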
active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + _complex_ops: std::marker::PhantomData, + temp_buffer, + } + } +} + +impl> Convolution for GenericFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init(impulse_response: &[f32], block_size: usize, max_response_length: usize) -> Self { + Self::init_internal(impulse_response, block_size, max_response_length) + } + + fn update(&mut self, response: &[f32]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + C::zero_real(&mut self.fft_buffer); + C::zero_complex(&mut self.conv); + C::zero_complex(&mut self.pre_multiplied); + C::zero_real(&mut self.overlap); + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + C::copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + C::zero_complex(&mut self.segments_ir[i]); + } + } + + fn process(&mut self, input: &[f32], output: &mut [f32]) { + if self.active_seg_count == 0 { + C::zero_real(output); + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input to input buffer + let input_buffer_pos = self.input_buffer_fill; + C::copy_and_pad( + &mut self.input_buffer[input_buffer_pos..], + &input[processed..processed + processing], + processing, + ); + + // Forward FFT + C::copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + C::zero_real(output); + return; + } + + // complex multiplication + if input_buffer_was_empty { + C::zero_complex(&mut self.pre_multiplied); + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + C::complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + } + } + + C::copy_complex(&mut self.conv, &self.pre_multiplied); + + C::complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + C::zero_real(output); + return; + } + + // Add overlap + C::sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + 
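                    // `C::zero_real` dispatches to the selected backend: a plain
                    // `fill(0.0)` for the rustfft ops, `ippsZero_32f` for the IPP ops.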
C::zero_real(&mut self.input_buffer); + self.input_buffer_fill = 0; + + // Save the overlap + C::copy_and_pad( + &mut self.overlap, + &self.fft_buffer[self.block_size..], + self.block_size, + ); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} + +// Generic TwoStageFFTConvolver implementation +#[derive(Clone)] +pub struct GenericTwoStageFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + head_convolver: GenericFFTConvolver, + tail_convolver0: GenericFFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: GenericFFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, +} + +impl> GenericTwoStageFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + pub fn init( + impulse_response: &[f32], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = GenericFFTConvolver::::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = if max_response_length > tail_block_size { + let tail_ir_len = std::cmp::min(max_response_length - tail_block_size, tail_block_size); + GenericFFTConvolver::::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + } else { + GenericFFTConvolver::::default() + }; + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = if max_response_length > 2 * tail_block_size { + let tail_ir_len = max_response_length - 2 * tail_block_size; + GenericFFTConvolver::::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + } else { + GenericFFTConvolver::::default() + }; + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + GenericTwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + head_block_size, + tail_block_size, + } + } + + pub fn update(&mut self, _response: &[f32]) { + todo!() + } + + pub fn process(&mut self, input: &[f32], output: &mut [f32]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + head_block_size - (self.tail_input_fill % head_block_size), + ); + + // Sum head and tail + let sum_begin = processed; + + // Sum: 1st tail block + if 
!self.tail_precalculated0.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated0 + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + C::copy_and_pad( + &mut self.tail_input[self.tail_input_fill..], + &input[processed..processed + processing], + processing, + ); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], + ); + if self.tail_input_fill == tail_block_size { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == tail_block_size + && self.tail_output.len() == tail_block_size + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == tail_block_size { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} + +// Type definitions for IPP and RustFFT implementations #[cfg(not(feature = "ipp"))] pub use rust_fft::{ complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, TwoStageFFTConvolver, }; +pub mod rust_ops { + use super::*; + use rustfft::num_complex::Complex; + + #[derive(Clone, Default)] + pub struct RustComplexOps; + + impl ComplexOps for RustComplexOps { + type Complex = Complex; + + fn complex_size(size: usize) -> usize { + rust_fft::complex_size(size) + } + + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + rust_fft::copy_and_pad(dst, src, src_size) + } + + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + _temp_buffer: Option<&mut [Self::Complex]>, + ) { + rust_fft::complex_multiply_accumulate(result, a, b) + } + + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + rust_fft::sum(result, a, b) + } + + fn zero_complex(buffer: &mut [Self::Complex]) { + buffer.fill(Complex { re: 0.0, im: 0.0 }); + } + + fn zero_real(buffer: &mut [f32]) { + buffer.fill(0.0); + } + + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + dst.clone_from_slice(src); + } + + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + rust_fft::add_to_buffer(dst, src); + } + } +} #[cfg(feature = "ipp")] -pub use self::ipp_fft::{ +pub use ipp_fft::{ complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, TwoStageFFTConvolver, }; +#[cfg(feature = "ipp")] +mod ipp_ops { + use super::*; + use ipp_sys::*; + use num_complex::Complex32; + + #[derive(Clone, Default)] + pub struct IppComplexOps; + + impl ComplexOps for IppComplexOps { + type Complex = Complex32; + + fn complex_size(size: usize) -> usize { + ipp_fft::complex_size(size) + } + + 
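        // The remaining ops delegate either to the helpers in `ipp_fft` or to raw
        // IPP primitives (`ippsZero_32fc`, `ippsCopy_32fc`, `ippsAdd_32f_I`); the
        // pointer casts there assume `Complex32` shares the two-`f32` layout of
        // `Ipp32fc`.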
fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + ipp_fft::copy_and_pad(dst, src, src_size) + } + + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ) { + let temp = temp_buffer.expect("IPP implementation requires a temp buffer"); + ipp_fft::complex_multiply_accumulate(result, a, b, temp) + } + + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + ipp_fft::sum(result, a, b) + } + + fn zero_complex(buffer: &mut [Self::Complex]) { + unsafe { + ippsZero_32fc( + buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + buffer.len() as i32, + ); + } + } + + fn zero_real(buffer: &mut [f32]) { + unsafe { + ippsZero_32f(buffer.as_mut_ptr(), buffer.len() as i32); + } + } + + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + unsafe { + ippsCopy_32fc( + src.as_ptr() as *const ipp_sys::Ipp32fc, + dst.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + dst.len() as i32, + ); + } + } + + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + unsafe { + ippsAdd_32f_I(src.as_ptr(), dst.as_mut_ptr(), dst.len() as i32); + } + } + } +} #[cfg(all(test, feature = "ipp"))] mod tests { use super::*; - use crate::{fft_convolver::rust_fft, Convolution}; + use crate::{fft_convolver::ipp_fft, fft_convolver::rust_fft, Convolution}; // Helper function that runs both implementations and compares results fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs index 870c292..a677440 100644 --- a/src/fft_convolver/rust_fft.rs +++ b/src/fft_convolver/rust_fft.rs @@ -1,7 +1,8 @@ -use crate::{Convolution, Sample}; +use crate::{fft_convolver::FftBackend, Convolution, Sample}; use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; use rustfft::num_complex::Complex; use std::sync::Arc; + #[derive(Clone)] pub struct Fft { fft_forward: Arc>, @@ -24,19 +25,30 @@ impl std::fmt::Debug for Fft { } } -impl Fft { - pub fn init(&mut self, length: usize) { +impl FftBackend for Fft { + type Complex = Complex; + + fn init(&mut self, length: usize) { let mut planner = RealFftPlanner::::new(); self.fft_forward = planner.plan_fft_forward(length); self.fft_inverse = planner.plan_fft_inverse(length); } - pub fn forward(&self, input: &mut [f32], output: &mut [Complex]) -> Result<(), FftError> { - self.fft_forward.process(input, output)?; - Ok(()) + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box> { + self.fft_forward + .process(input, output) + .map_err(|e| e.into()) } - pub fn inverse(&self, input: &mut [Complex], output: &mut [f32]) -> Result<(), FftError> { + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box> { self.fft_inverse.process(input, output)?; // FFT Normalization @@ -46,221 +58,7 @@ impl Fft { Ok(()) } } -#[derive(Default, Clone)] -pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>>, - segments_ir: Vec>>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec>, - conv: Vec>, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, -} - -impl Convolution for FFTConvolver { - fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { - if max_response_length < impulse_response.len() { - panic!( - 
"max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; - let conv = vec![Complex::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - self.fft_buffer.fill(0.); - self.conv.fill(Complex::new(0., 0.)); - self.pre_multiplied.fill(Complex::new(0., 0.)); - self.overlap.fill(0.); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - self.segments_ir[i].fill(Complex::new(0., 0.)); - } - } - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - output.fill(0.); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - let input_buffer_pos = self.input_buffer_fill; - self.input_buffer[input_buffer_pos..input_buffer_pos + processing] - .clone_from_slice(&input[processed..processed + processing]); - - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_err) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - output.fill(0.); - return; // error! 
- } - - // complex multiplication - if input_buffer_was_empty { - self.pre_multiplied.fill(Complex { re: 0., im: 0. }); - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - ); - } - } - self.conv.clone_from_slice(&self.pre_multiplied); - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - ); - - // Backward FFT - if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - output.fill(0.); - return; // error! - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - self.input_buffer.fill(0.); - self.input_buffer_fill = 0; - // Save the overlap - self.overlap - .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } -} pub fn complex_size(size: usize) -> usize { (size / 2) + 1 } @@ -311,6 +109,22 @@ pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { result[i] = a[i] + b[i]; } } + +pub fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + assert_eq!(dst.len(), src.len()); + let len = dst.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + dst[i + 0] += src[i + 0]; + dst[i + 1] += src[i + 1]; + dst[i + 2] += src[i + 2]; + dst[i + 3] += src[i + 3]; + } + for i in end4..len { + dst[i] += src[i]; + } +} + #[test] fn test_fft_convolver_passthrough() { let mut response = [0.0; 1024]; @@ -324,177 +138,27 @@ fn test_fft_convolver_passthrough() { assert!((output[i] - 1.0).abs() < 1e-6); } } -#[derive(Clone)] -pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - head_block_size: usize, - tail_block_size: usize, -} + +pub type FFTConvolver = + crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = crate::fft_convolver::GenericTwoStageFFTConvolver< + Fft, + crate::fft_convolver::rust_ops::RustComplexOps, +>; impl TwoStageFFTConvolver { - pub fn init( - impulse_response: &[Sample], - _block_size: usize, + pub fn with_block_sizes( + impulse_response: &[f32], max_response_length: usize, head_block_size: usize, tail_block_size: usize, ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], + TwoStageFFTConvolver::init( + impulse_response, head_block_size, max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - 
std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, head_block_size, tail_block_size, - } - } - - pub fn update(&mut self, _response: &[Sample]) { - todo!() - } - - pub fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - let head_block_size = self.head_block_size; - let tail_block_size = self.tail_block_size; - - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - head_block_size - (self.tail_input_fill % head_block_size), - ); - - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; - - // Sum: 1st tail block - if self.tail_precalculated0.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated0[precalculated_pos]; - precalculated_pos += 1; - } - } - - // Sum: 2nd-Nth tail block - if self.tail_precalculated.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated[precalculated_pos]; - precalculated_pos += 1; - } - } - - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] - .copy_from_slice(&input[processed..processed + processing]); - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % head_block_size == 0 { - assert!(self.tail_input_fill >= head_block_size); - let block_offset = self.tail_input_fill - head_block_size; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + head_block_size], - &mut self.tail_output0[block_offset..block_offset + head_block_size], - ); - if self.tail_input_fill == tail_block_size { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if self.tail_precalculated.len() > 0 - && self.tail_input_fill == tail_block_size - && self.tail_output.len() == tail_block_size - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == tail_block_size { - self.tail_input_fill = 0; - 
self.precalculated_pos = 0; - } - - processed += processing; - } + ) } } diff --git a/src/tests.rs b/src/tests.rs index 7b2be9a..7a87161 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,7 +1,7 @@ #[cfg(test)] mod tests { use crate::crossfade_convolver::CrossfadeConvolver; - use crate::fft_convolver::FFTConvolver; + use crate::fft_convolver::{FFTConvolver, TwoStageFFTConvolver}; use crate::{Convolution, Sample}; #[test] @@ -32,6 +32,38 @@ mod tests { signal } + #[test] + fn two_stage_convolver_latency_test() { + let block_size = 32; + let input_size = block_size * 2; + let fir_size = 4096; + let mut response = vec![0.0; fir_size]; + response[63] = 1.0; + let mut two_stage_convolver_1 = + TwoStageFFTConvolver::init(&response, block_size, fir_size, 64, 1024); + let mut two_stage_convolver_2 = + TwoStageFFTConvolver::init(&response, block_size, fir_size, 32, 1024); + let mut input = vec![0.0; input_size]; + input[0] = 1.0; + let mut output_a = vec![0.0; input_size]; + let mut output_b = vec![0.0; input_size]; + let first_half_input = input[..block_size].as_ref(); + let second_half_input = input[block_size..].as_ref(); + let first_half_output_a = &mut output_a[..block_size]; + let first_half_output_b = &mut output_b[..block_size]; + two_stage_convolver_1.process(first_half_input, first_half_output_a); + two_stage_convolver_2.process(first_half_input, first_half_output_b); + let second_half_output_a = &mut output_a[block_size..]; + let second_half_output_b = &mut output_b[block_size..]; + two_stage_convolver_1.process(second_half_input, second_half_output_a); + two_stage_convolver_2.process(second_half_input, second_half_output_b); + for i in 0..output_a.len() { + assert!((output_a[i] - output_b[i]).abs() < 0.000001); + } + assert!((output_a[63] - 1.0).abs() < 0.000001); + assert!((output_b[63] - 1.0).abs() < 0.000001); + } + #[test] fn fft_convolver_update_is_reset() { let block_size = 512; From 1b1d62c093b5a8f6fee1199118dcea65202790d3 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 27 May 2025 15:37:46 +0200 Subject: [PATCH 08/11] Refactor --- src/fft_convolver/fft_convolver.rs | 253 +++++++++ src/fft_convolver/ipp_fft.rs | 148 ++++-- src/fft_convolver/mod.rs | 619 +---------------------- src/fft_convolver/rust_fft.rs | 177 ++++--- src/fft_convolver/traits.rs | 33 ++ src/fft_convolver/two_stage_convolver.rs | 188 +++++++ 6 files changed, 672 insertions(+), 746 deletions(-) create mode 100644 src/fft_convolver/fft_convolver.rs create mode 100644 src/fft_convolver/traits.rs create mode 100644 src/fft_convolver/two_stage_convolver.rs diff --git a/src/fft_convolver/fft_convolver.rs b/src/fft_convolver/fft_convolver.rs new file mode 100644 index 0000000..336ae93 --- /dev/null +++ b/src/fft_convolver/fft_convolver.rs @@ -0,0 +1,253 @@ +use crate::fft_convolver::traits::{ComplexOps, FftBackend}; +use crate::Convolution; + +#[derive(Default, Clone)] +pub struct GenericFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: F, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + _complex_ops: std::marker::PhantomData, + temp_buffer: Option>, +} + +impl> Convolution for GenericFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init(impulse_response: &[f32], block_size: 
usize, max_response_length: usize) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = C::complex_size(seg_size); + + // FFT + let mut fft = F::default(); + fft.init(seg_size); + let fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::with_capacity(seg_count); + for _ in 0..seg_count { + segments.push(vec![F::Complex::default(); fft_complex_size]); + } + + let mut segments_ir = Vec::with_capacity(seg_count); + + // prepare ir + let mut ir_buffer = vec![0.0; seg_size]; + for i in 0..seg_count { + let mut segment = vec![F::Complex::default(); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + C::copy_and_pad(&mut ir_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut ir_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![F::Complex::default(); fft_complex_size]; + let conv = vec![F::Complex::default(); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + // For IPP backend we need a temp buffer + let temp_buffer = Some(vec![F::Complex::default(); fft_complex_size]); + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + _complex_ops: std::marker::PhantomData, + temp_buffer, + } + } + + fn update(&mut self, response: &[f32]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + C::zero_real(&mut self.fft_buffer); + C::zero_complex(&mut self.conv); + C::zero_complex(&mut self.pre_multiplied); + C::zero_real(&mut self.overlap); + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + C::copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + C::zero_complex(&mut self.segments_ir[i]); + } + } + + fn process(&mut self, input: &[f32], output: &mut [f32]) { + if self.active_seg_count == 0 { + C::zero_real(output); + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input 
to input buffer + let input_buffer_pos = self.input_buffer_fill; + C::copy_and_pad( + &mut self.input_buffer[input_buffer_pos..], + &input[processed..processed + processing], + processing, + ); + + // Forward FFT + C::copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + C::zero_real(output); + return; + } + + // complex multiplication + if input_buffer_was_empty { + C::zero_complex(&mut self.pre_multiplied); + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + C::complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + } + } + + C::copy_complex(&mut self.conv, &self.pre_multiplied); + + C::complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + C::zero_real(output); + return; + } + + // Add overlap + C::sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + C::zero_real(&mut self.input_buffer); + self.input_buffer_fill = 0; + + // Save the overlap + C::copy_and_pad( + &mut self.overlap, + &self.fft_buffer[self.block_size..], + self.block_size, + ); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs index 7af4bbf..2329347 100644 --- a/src/fft_convolver/ipp_fft.rs +++ b/src/fft_convolver/ipp_fft.rs @@ -1,4 +1,7 @@ -use crate::{fft_convolver::FftBackend, Convolution, Sample}; +use crate::{ + fft_convolver::{ComplexOps, FftBackend}, + Convolution, Sample, +}; use ipp_sys::*; use num_complex::Complex32; @@ -137,71 +140,106 @@ impl FftBackend for Fft { } } -pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 -} +#[derive(Clone, Default)] +pub struct IppComplexOps; + +impl ComplexOps for IppComplexOps { + type Complex = Complex32; + fn complex_size(size: usize) -> usize { + (size / 2) + 1 + } + + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size, "Destination buffer too small"); + + unsafe { + ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); + + if dst.len() > src_size { + ippsZero_32f( + dst.as_mut_ptr().add(src_size), + (dst.len() - src_size) as i32, + ); + } + } + } -pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size, "Destination buffer too small"); + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ) { + let temp_buffer = temp_buffer.expect("IPP implementation requires a temp buffer"); + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + assert_eq!(temp_buffer.len(), a.len()); - unsafe { - ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), 
src_size as i32); + unsafe { + let len = result.len() as i32; + + ippsMul_32fc( + a.as_ptr() as *const ipp_sys::Ipp32fc, + b.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, + ); - if dst.len() > src_size { - ippsZero_32f( - dst.as_mut_ptr().add(src_size), - (dst.len() - src_size) as i32, + ippsAdd_32fc( + temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, ); } } -} -pub fn complex_multiply_accumulate( - result: &mut [Complex32], - a: &[Complex32], - b: &[Complex32], - temp_buffer: &mut [Complex32], -) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - assert_eq!(temp_buffer.len(), a.len()); - - unsafe { - let len = result.len() as i32; - - ippsMul_32fc( - a.as_ptr() as *const ipp_sys::Ipp32fc, - b.as_ptr() as *const ipp_sys::Ipp32fc, - temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - len, - ); + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); - ippsAdd_32fc( - temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, - result.as_ptr() as *const ipp_sys::Ipp32fc, - result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - len, - ); + unsafe { + ippsAdd_32f( + a.as_ptr(), + b.as_ptr(), + result.as_mut_ptr(), + result.len() as i32, + ); + } } -} -pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); + fn zero_complex(buffer: &mut [Self::Complex]) { + unsafe { + ippsZero_32fc( + buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + buffer.len() as i32, + ); + } + } - unsafe { - ippsAdd_32f( - a.as_ptr(), - b.as_ptr(), - result.as_mut_ptr(), - result.len() as i32, - ); + fn zero_real(buffer: &mut [f32]) { + unsafe { + ippsZero_32f(buffer.as_mut_ptr(), buffer.len() as i32); + } + } + + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + unsafe { + ippsCopy_32fc( + src.as_ptr() as *const ipp_sys::Ipp32fc, + dst.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + dst.len() as i32, + ); + } + } + + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + unsafe { + ippsAdd_32f_I(src.as_ptr(), dst.as_mut_ptr(), dst.len() as i32); + } } } -pub type FFTConvolver = - crate::fft_convolver::GenericFFTConvolver; -pub type TwoStageFFTConvolver = crate::fft_convolver::GenericTwoStageFFTConvolver< - Fft, - crate::fft_convolver::ipp_ops::IppComplexOps, ->; +pub type FFTConvolver = crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = + crate::fft_convolver::two_stage_convolver::GenericTwoStageFFTConvolver; diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs index bfc6450..19bd786 100644 --- a/src/fft_convolver/mod.rs +++ b/src/fft_convolver/mod.rs @@ -1,623 +1,18 @@ -use crate::Convolution; +pub mod fft_convolver; #[cfg(feature = "ipp")] pub mod ipp_fft; pub mod rust_fft; +pub mod traits; +pub mod two_stage_convolver; -// Define a trait that will be implemented by both FFT backends -pub trait FftBackend: Clone { - type Complex; +use fft_convolver::GenericFFTConvolver; +use traits::{ComplexOps, FftBackend}; - fn init(&mut self, size: usize); - fn forward( - &mut self, - input: &mut [f32], - output: &mut [Self::Complex], - ) -> Result<(), Box>; - fn inverse( - &mut self, - input: &mut [Self::Complex], - output: &mut [f32], - ) -> Result<(), Box>; -} - -// Define trait for complex operations that will be used by both backends -pub trait ComplexOps: Clone { - type Complex: 
Clone + Default; - - fn complex_size(size: usize) -> usize; - fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize); - fn complex_multiply_accumulate( - result: &mut [Self::Complex], - a: &[Self::Complex], - b: &[Self::Complex], - temp_buffer: Option<&mut [Self::Complex]>, - ); - fn sum(result: &mut [f32], a: &[f32], b: &[f32]); - fn zero_complex(buffer: &mut [Self::Complex]); - fn zero_real(buffer: &mut [f32]); - fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]); - fn add_to_buffer(dst: &mut [f32], src: &[f32]); -} - -// Generic FFTConvolver implementation that works with any backend -#[derive(Default, Clone)] -pub struct GenericFFTConvolver> -where - C: Default, - F: Default, - F::Complex: Clone + Default, -{ - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>, - segments_ir: Vec>, - fft_buffer: Vec, - fft: F, - pre_multiplied: Vec, - conv: Vec, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, - _complex_ops: std::marker::PhantomData, - temp_buffer: Option>, -} - -impl> GenericFFTConvolver -where - C: Default, - F: Default, - F::Complex: Clone + Default, -{ - fn init_internal( - impulse_response: &[f32], - block_size: usize, - max_response_length: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = C::complex_size(seg_size); - - // FFT - let mut fft = F::default(); - fft.init(seg_size); - let fft_buffer = vec![0.; seg_size]; - - // prepare segments - let mut segments = Vec::with_capacity(seg_count); - for _ in 0..seg_count { - segments.push(vec![F::Complex::default(); fft_complex_size]); - } - - let mut segments_ir = Vec::with_capacity(seg_count); - - // prepare ir - let mut ir_buffer = vec![0.0; seg_size]; - for i in 0..seg_count { - let mut segment = vec![F::Complex::default(); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - C::copy_and_pad(&mut ir_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut ir_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![F::Complex::default(); fft_complex_size]; - let conv = vec![F::Complex::default(); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - // For IPP backend we need a temp buffer - let temp_buffer = Some(vec![F::Complex::default(); fft_complex_size]); - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - _complex_ops: std::marker::PhantomData, - temp_buffer, - } - } -} - -impl> Convolution for GenericFFTConvolver -where - C: Default, - F: Default, - F::Complex: Clone + 
Default, -{ - fn init(impulse_response: &[f32], block_size: usize, max_response_length: usize) -> Self { - Self::init_internal(impulse_response, block_size, max_response_length) - } - - fn update(&mut self, response: &[f32]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - C::zero_real(&mut self.fft_buffer); - C::zero_complex(&mut self.conv); - C::zero_complex(&mut self.pre_multiplied); - C::zero_real(&mut self.overlap); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - C::copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - C::zero_complex(&mut self.segments_ir[i]); - } - } - - fn process(&mut self, input: &[f32], output: &mut [f32]) { - if self.active_seg_count == 0 { - C::zero_real(output); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - // Copy input to input buffer - let input_buffer_pos = self.input_buffer_fill; - C::copy_and_pad( - &mut self.input_buffer[input_buffer_pos..], - &input[processed..processed + processing], - processing, - ); - - // Forward FFT - C::copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - C::zero_real(output); - return; - } - - // complex multiplication - if input_buffer_was_empty { - C::zero_complex(&mut self.pre_multiplied); - - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - C::complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), - ); - } - } - - C::copy_complex(&mut self.conv, &self.pre_multiplied); - - C::complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), - ); - - // Backward FFT - if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - C::zero_real(output); - return; - } - - // Add overlap - C::sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - C::zero_real(&mut self.input_buffer); - self.input_buffer_fill = 0; - - // Save the overlap - C::copy_and_pad( - &mut self.overlap, - &self.fft_buffer[self.block_size..], - self.block_size, - ); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - 
} -} - -// Generic TwoStageFFTConvolver implementation -#[derive(Clone)] -pub struct GenericTwoStageFFTConvolver> -where - C: Default, - F: Default, - F::Complex: Clone + Default, -{ - head_convolver: GenericFFTConvolver, - tail_convolver0: GenericFFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: GenericFFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, - head_block_size: usize, - tail_block_size: usize, -} - -impl> GenericTwoStageFFTConvolver -where - C: Default, - F: Default, - F::Complex: Clone + Default, -{ - pub fn init( - impulse_response: &[f32], - _block_size: usize, - max_response_length: usize, - head_block_size: usize, - tail_block_size: usize, - ) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = GenericFFTConvolver::::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = if max_response_length > tail_block_size { - let tail_ir_len = std::cmp::min(max_response_length - tail_block_size, tail_block_size); - GenericFFTConvolver::::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - } else { - GenericFFTConvolver::::default() - }; - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = if max_response_length > 2 * tail_block_size { - let tail_ir_len = max_response_length - 2 * tail_block_size; - GenericFFTConvolver::::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - } else { - GenericFFTConvolver::::default() - }; - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - GenericTwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - head_block_size, - tail_block_size, - } - } - - pub fn update(&mut self, _response: &[f32]) { - todo!() - } - - pub fn process(&mut self, input: &[f32], output: &mut [f32]) { - let head_block_size = self.head_block_size; - let tail_block_size = self.tail_block_size; - - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - head_block_size - (self.tail_input_fill % head_block_size), - ); - - // Sum head and tail - let sum_begin = processed; - - // Sum: 1st tail block - if !self.tail_precalculated0.is_empty() { - C::add_to_buffer( - &mut output[sum_begin..sum_begin + processing], - &self.tail_precalculated0 - [self.precalculated_pos..self.precalculated_pos + processing], - ); - } - - // Sum: 2nd-Nth tail block - if !self.tail_precalculated.is_empty() { - C::add_to_buffer( - &mut output[sum_begin..sum_begin + processing], - &self.tail_precalculated - 
[self.precalculated_pos..self.precalculated_pos + processing], - ); - } - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - C::copy_and_pad( - &mut self.tail_input[self.tail_input_fill..], - &input[processed..processed + processing], - processing, - ); - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { - assert!(self.tail_input_fill >= head_block_size); - let block_offset = self.tail_input_fill - head_block_size; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + head_block_size], - &mut self.tail_output0[block_offset..block_offset + head_block_size], - ); - if self.tail_input_fill == tail_block_size { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if !self.tail_precalculated.is_empty() - && self.tail_input_fill == tail_block_size - && self.tail_output.len() == tail_block_size - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == tail_block_size { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } -} - -// Type definitions for IPP and RustFFT implementations #[cfg(not(feature = "ipp"))] -pub use rust_fft::{ - complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, - TwoStageFFTConvolver, -}; -pub mod rust_ops { - use super::*; - use rustfft::num_complex::Complex; - - #[derive(Clone, Default)] - pub struct RustComplexOps; - - impl ComplexOps for RustComplexOps { - type Complex = Complex; - - fn complex_size(size: usize) -> usize { - rust_fft::complex_size(size) - } - - fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - rust_fft::copy_and_pad(dst, src, src_size) - } - - fn complex_multiply_accumulate( - result: &mut [Self::Complex], - a: &[Self::Complex], - b: &[Self::Complex], - _temp_buffer: Option<&mut [Self::Complex]>, - ) { - rust_fft::complex_multiply_accumulate(result, a, b) - } +pub use rust_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; - fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - rust_fft::sum(result, a, b) - } - - fn zero_complex(buffer: &mut [Self::Complex]) { - buffer.fill(Complex { re: 0.0, im: 0.0 }); - } - - fn zero_real(buffer: &mut [f32]) { - buffer.fill(0.0); - } - - fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { - dst.clone_from_slice(src); - } - - fn add_to_buffer(dst: &mut [f32], src: &[f32]) { - rust_fft::add_to_buffer(dst, src); - } - } -} - -#[cfg(feature = "ipp")] -pub use ipp_fft::{ - complex_multiply_accumulate, complex_size, copy_and_pad, sum, FFTConvolver, Fft, - TwoStageFFTConvolver, -}; #[cfg(feature = "ipp")] -mod ipp_ops { - use super::*; - use ipp_sys::*; - use num_complex::Complex32; - - #[derive(Clone, Default)] - pub struct IppComplexOps; - - impl ComplexOps for IppComplexOps { - type Complex = Complex32; - - fn complex_size(size: usize) -> usize { - ipp_fft::complex_size(size) - } - - fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - ipp_fft::copy_and_pad(dst, src, src_size) - } - - fn complex_multiply_accumulate( - result: &mut [Self::Complex], - a: &[Self::Complex], - b: &[Self::Complex], - temp_buffer: Option<&mut [Self::Complex]>, - ) { - let temp = temp_buffer.expect("IPP implementation requires 
a temp buffer"); - ipp_fft::complex_multiply_accumulate(result, a, b, temp) - } - - fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - ipp_fft::sum(result, a, b) - } - - fn zero_complex(buffer: &mut [Self::Complex]) { - unsafe { - ippsZero_32fc( - buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - buffer.len() as i32, - ); - } - } - - fn zero_real(buffer: &mut [f32]) { - unsafe { - ippsZero_32f(buffer.as_mut_ptr(), buffer.len() as i32); - } - } - - fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { - unsafe { - ippsCopy_32fc( - src.as_ptr() as *const ipp_sys::Ipp32fc, - dst.as_mut_ptr() as *mut ipp_sys::Ipp32fc, - dst.len() as i32, - ); - } - } - - fn add_to_buffer(dst: &mut [f32], src: &[f32]) { - unsafe { - ippsAdd_32f_I(src.as_ptr(), dst.as_mut_ptr(), dst.len() as i32); - } - } - } -} +pub use ipp_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; #[cfg(all(test, feature = "ipp"))] mod tests { diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs index a677440..cb58478 100644 --- a/src/fft_convolver/rust_fft.rs +++ b/src/fft_convolver/rust_fft.rs @@ -1,4 +1,7 @@ -use crate::{fft_convolver::FftBackend, Convolution, Sample}; +use crate::{ + fft_convolver::{ComplexOps, FftBackend}, + Convolution, Sample, +}; use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; use rustfft::num_complex::Complex; use std::sync::Arc; @@ -59,72 +62,91 @@ impl FftBackend for Fft { } } -pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 -} +#[derive(Clone, Default)] +pub struct RustComplexOps; -pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size); - dst[0..src_size].clone_from_slice(&src[0..src_size]); - dst[src_size..].iter_mut().for_each(|value| *value = 0.); -} +impl ComplexOps for RustComplexOps { + type Complex = Complex; -pub fn complex_multiply_accumulate( - result: &mut [Complex], - a: &[Complex], - b: &[Complex], -) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 4 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; - result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; - result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; - result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; - result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; - result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; - result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; - result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + fn complex_size(size: usize) -> usize { + (size / 2) + 1 } - for i in end4..len { - result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; - result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size); + dst[0..src_size].clone_from_slice(&src[0..src_size]); + dst[src_size..].iter_mut().for_each(|value| *value = 0.); } -} -pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0] = a[i + 0] + b[i + 0]; - result[i + 1] = a[i + 1] + b[i + 1]; - result[i + 2] = a[i + 2] + b[i + 2]; - result[i + 3] = a[i + 3] + b[i + 3]; + fn complex_multiply_accumulate( 
+ result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + _temp_buffer: Option<&mut [Self::Complex]>, + ) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 4 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; + result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; + result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; + result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; + result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; + result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; + result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; + result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + } + for i in end4..len { + result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; + result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + } } - for i in end4..len { - result[i] = a[i] + b[i]; + + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0] = a[i + 0] + b[i + 0]; + result[i + 1] = a[i + 1] + b[i + 1]; + result[i + 2] = a[i + 2] + b[i + 2]; + result[i + 3] = a[i + 3] + b[i + 3]; + } + for i in end4..len { + result[i] = a[i] + b[i]; + } } -} -pub fn add_to_buffer(dst: &mut [f32], src: &[f32]) { - assert_eq!(dst.len(), src.len()); - let len = dst.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - dst[i + 0] += src[i + 0]; - dst[i + 1] += src[i + 1]; - dst[i + 2] += src[i + 2]; - dst[i + 3] += src[i + 3]; + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + assert_eq!(dst.len(), src.len()); + let len = dst.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + dst[i + 0] += src[i + 0]; + dst[i + 1] += src[i + 1]; + dst[i + 2] += src[i + 2]; + dst[i + 3] += src[i + 3]; + } + for i in end4..len { + dst[i] += src[i]; + } } - for i in end4..len { - dst[i] += src[i]; + + fn zero_complex(buffer: &mut [Self::Complex]) { + buffer.fill(Complex { re: 0.0, im: 0.0 }); + } + + fn zero_real(buffer: &mut [f32]) { + buffer.fill(0.0); } -} + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + dst.clone_from_slice(src); + } +} #[test] fn test_fft_convolver_passthrough() { let mut response = [0.0; 1024]; @@ -139,26 +161,23 @@ fn test_fft_convolver_passthrough() { } } -pub type FFTConvolver = - crate::fft_convolver::GenericFFTConvolver; -pub type TwoStageFFTConvolver = crate::fft_convolver::GenericTwoStageFFTConvolver< - Fft, - crate::fft_convolver::rust_ops::RustComplexOps, ->; - -impl TwoStageFFTConvolver { - pub fn with_block_sizes( - impulse_response: &[f32], - max_response_length: usize, - head_block_size: usize, - tail_block_size: usize, - ) -> Self { - TwoStageFFTConvolver::init( - impulse_response, - head_block_size, - max_response_length, - head_block_size, - tail_block_size, - ) - } -} +pub type FFTConvolver = crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = + crate::fft_convolver::two_stage_convolver::GenericTwoStageFFTConvolver; + +// impl TwoStageFFTConvolver { +// pub fn with_block_sizes( +// impulse_response: &[f32], +// max_response_length: usize, +// head_block_size: usize, +// tail_block_size: usize, +// ) -> Self { +// 
TwoStageFFTConvolver::init( +// impulse_response, +// head_block_size, +// max_response_length, +// head_block_size, +// tail_block_size, +// ) +// } +// } diff --git a/src/fft_convolver/traits.rs b/src/fft_convolver/traits.rs new file mode 100644 index 0000000..3d7e835 --- /dev/null +++ b/src/fft_convolver/traits.rs @@ -0,0 +1,33 @@ +pub trait FftBackend: Clone { + type Complex; + + fn init(&mut self, size: usize); + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box>; + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box>; +} + +pub trait ComplexOps: Clone { + type Complex: Clone + Default; + + fn complex_size(size: usize) -> usize; + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize); + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ); + fn sum(result: &mut [f32], a: &[f32], b: &[f32]); + fn zero_complex(buffer: &mut [Self::Complex]); + fn zero_real(buffer: &mut [f32]); + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]); + fn add_to_buffer(dst: &mut [f32], src: &[f32]); +} diff --git a/src/fft_convolver/two_stage_convolver.rs b/src/fft_convolver/two_stage_convolver.rs new file mode 100644 index 0000000..1225b20 --- /dev/null +++ b/src/fft_convolver/two_stage_convolver.rs @@ -0,0 +1,188 @@ +use crate::fft_convolver::fft_convolver::GenericFFTConvolver; +use crate::fft_convolver::traits::{ComplexOps, FftBackend}; +use crate::Convolution; + +#[derive(Clone)] +pub struct GenericTwoStageFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + head_convolver: GenericFFTConvolver, + tail_convolver0: GenericFFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: GenericFFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, +} + +impl> GenericTwoStageFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + pub fn init( + impulse_response: &[f32], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = GenericFFTConvolver::::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = if max_response_length > tail_block_size { + let tail_ir_len = std::cmp::min(max_response_length - tail_block_size, tail_block_size); + GenericFFTConvolver::::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + } else { + GenericFFTConvolver::::default() + }; + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = if max_response_length > 2 * tail_block_size { + let tail_ir_len = max_response_length - 2 * tail_block_size; + GenericFFTConvolver::::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + } else { + 
GenericFFTConvolver::::default() + }; + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + GenericTwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + head_block_size, + tail_block_size, + } + } + + pub fn update(&mut self, _response: &[f32]) { + todo!() + } + + pub fn process(&mut self, input: &[f32], output: &mut [f32]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + head_block_size - (self.tail_input_fill % head_block_size), + ); + + // Sum head and tail + let sum_begin = processed; + + // Sum: 1st tail block + if !self.tail_precalculated0.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated0 + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + C::copy_and_pad( + &mut self.tail_input[self.tail_input_fill..], + &input[processed..processed + processing], + processing, + ); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], + ); + if self.tail_input_fill == tail_block_size { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == tail_block_size + && self.tail_output.len() == tail_block_size + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == tail_block_size { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} From 1ef9594e819c45ff8ab5b9c883a2772b3d3f21b1 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 27 May 2025 16:05:34 +0200 Subject: [PATCH 09/11] Move tests --- src/fft_convolver/ipp_fft.rs | 5 +-- src/fft_convolver/mod.rs | 71 ----------------------------------- src/fft_convolver/rust_fft.rs | 20 +--------- src/tests.rs | 63 +++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 93 deletions(-) diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs index 2329347..43071e6 100644 --- a/src/fft_convolver/ipp_fft.rs +++ b/src/fft_convolver/ipp_fft.rs @@ -1,7 +1,4 @@ -use 
crate::{ - fft_convolver::{ComplexOps, FftBackend}, - Convolution, Sample, -}; +use crate::fft_convolver::{ComplexOps, FftBackend}; use ipp_sys::*; use num_complex::Complex32; diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs index 19bd786..68f8436 100644 --- a/src/fft_convolver/mod.rs +++ b/src/fft_convolver/mod.rs @@ -13,74 +13,3 @@ pub use rust_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; #[cfg(feature = "ipp")] pub use ipp_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; - -#[cfg(all(test, feature = "ipp"))] -mod tests { - use super::*; - use crate::{fft_convolver::ipp_fft, fft_convolver::rust_fft, Convolution}; - - // Helper function that runs both implementations and compares results - fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { - let max_len = impulse_response.len(); - - // Create both implementations - let mut rust_convolver = - rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); - let mut ipp_convolver = ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); - - // Prepare output buffers - let mut rust_output = vec![0.0; input.len()]; - let mut ipp_output = vec![0.0; input.len()]; - - // Process with both implementations - rust_convolver.process(input, &mut rust_output); - ipp_convolver.process(input, &mut ipp_output); - - // Compare results (accounting for floating-point precision differences) - for i in 0..input.len() { - assert!( - (rust_output[i] - ipp_output[i]).abs() < 1e-5, - "Outputs differ at position {}: rust={}, ipp={}", - i, - rust_output[i], - ipp_output[i] - ); - } - } - - #[test] - fn test_ipp_vs_rust_impulse() { - // Test with an impulse response - let mut response = vec![0.0; 1024]; - response[0] = 1.0; - let input = vec![1.0; 1024]; - - compare_implementations(&response, &input, 256); - } - - #[test] - fn test_ipp_vs_rust_decay() { - // Test with a decaying impulse response - let mut response = vec![0.0; 1024]; - for i in 0..response.len() { - response[i] = 0.9f32.powi(i as i32); - } - let input = vec![1.0; 1024]; - - compare_implementations(&response, &input, 256); - } - - #[test] - fn test_ipp_vs_rust_sine() { - // Test with a sine wave input - let mut response = vec![0.0; 1024]; - response[0] = 1.0; - - let mut input = vec![0.0; 1024]; - for i in 0..input.len() { - input[i] = (i as f32 * 0.1).sin(); - } - - compare_implementations(&response, &input, 128); - } -} diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs index cb58478..30ccb87 100644 --- a/src/fft_convolver/rust_fft.rs +++ b/src/fft_convolver/rust_fft.rs @@ -1,8 +1,5 @@ -use crate::{ - fft_convolver::{ComplexOps, FftBackend}, - Convolution, Sample, -}; -use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; +use crate::fft_convolver::{ComplexOps, FftBackend}; +use realfft::{ComplexToReal, RealFftPlanner, RealToComplex}; use rustfft::num_complex::Complex; use std::sync::Arc; @@ -147,19 +144,6 @@ impl ComplexOps for RustComplexOps { dst.clone_from_slice(src); } } -#[test] -fn test_fft_convolver_passthrough() { - let mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); - } -} pub type FFTConvolver = crate::fft_convolver::GenericFFTConvolver; pub type TwoStageFFTConvolver = diff --git a/src/tests.rs b/src/tests.rs index 
7a87161..830b302 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -164,4 +164,67 @@ mod tests { } } } + + #[cfg(feature = "ipp")] + mod ipp_comparison_tests { + use crate::{fft_convolver::ipp_fft, fft_convolver::rust_fft, Convolution}; + + fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { + let max_len = impulse_response.len(); + + let mut rust_convolver = + rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); + let mut ipp_convolver = + ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); + + let mut rust_output = vec![0.0; input.len()]; + let mut ipp_output = vec![0.0; input.len()]; + + rust_convolver.process(input, &mut rust_output); + ipp_convolver.process(input, &mut ipp_output); + + for i in 0..input.len() { + assert!( + (rust_output[i] - ipp_output[i]).abs() < 1e-5, + "Outputs differ at position {}: rust={}, ipp={}", + i, + rust_output[i], + ipp_output[i] + ); + } + } + + #[test] + fn test_ipp_vs_rust_impulse() { + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_decay() { + let mut response = vec![0.0; 1024]; + for i in 0..response.len() { + response[i] = 0.9f32.powi(i as i32); + } + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_sine() { + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + + let mut input = vec![0.0; 1024]; + for i in 0..input.len() { + input[i] = (i as f32 * 0.1).sin(); + } + + compare_implementations(&response, &input, 128); + } + } } From a4930342e75b6cd4755b91f38d9f3e0577b13626 Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 27 May 2025 16:23:50 +0200 Subject: [PATCH 10/11] Reimplement convolution trait for two state convolver --- src/fft_convolver/two_stage_convolver.rs | 27 +++++++++++++++++++++--- src/tests.rs | 6 +++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/fft_convolver/two_stage_convolver.rs b/src/fft_convolver/two_stage_convolver.rs index 1225b20..68e597f 100644 --- a/src/fft_convolver/two_stage_convolver.rs +++ b/src/fft_convolver/two_stage_convolver.rs @@ -2,6 +2,9 @@ use crate::fft_convolver::fft_convolver::GenericFFTConvolver; use crate::fft_convolver::traits::{ComplexOps, FftBackend}; use crate::Convolution; +const DEFAULT_HEAD_BLOCK_SIZE: usize = 128; +const DEFAULT_TAIL_BLOCK_SIZE: usize = 1024; + #[derive(Clone)] pub struct GenericTwoStageFFTConvolver> where @@ -29,7 +32,7 @@ where F: Default, F::Complex: Clone + Default, { - pub fn init( + pub fn with_sizes( impulse_response: &[f32], _block_size: usize, max_response_length: usize, @@ -97,12 +100,30 @@ where tail_block_size, } } +} + +impl> Convolution + for GenericTwoStageFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init(impulse_response: &[f32], _block_size: usize, max_response_length: usize) -> Self { + Self::with_sizes( + impulse_response, + _block_size, + max_response_length, + DEFAULT_HEAD_BLOCK_SIZE, + DEFAULT_TAIL_BLOCK_SIZE, + ) + } - pub fn update(&mut self, _response: &[f32]) { + fn update(&mut self, _response: &[f32]) { todo!() } - pub fn process(&mut self, input: &[f32], output: &mut [f32]) { + fn process(&mut self, input: &[f32], output: &mut [f32]) { let head_block_size = self.head_block_size; let tail_block_size = self.tail_block_size; diff --git a/src/tests.rs b/src/tests.rs index 830b302..7cbb439 100644 --- a/src/tests.rs +++ 
b/src/tests.rs @@ -33,16 +33,16 @@ mod tests { } #[test] - fn two_stage_convolver_latency_test() { + fn two_stage_convolver_head_tail_sizes_test() { let block_size = 32; let input_size = block_size * 2; let fir_size = 4096; let mut response = vec![0.0; fir_size]; response[63] = 1.0; let mut two_stage_convolver_1 = - TwoStageFFTConvolver::init(&response, block_size, fir_size, 64, 1024); + TwoStageFFTConvolver::with_sizes(&response, block_size, fir_size, 64, 1024); let mut two_stage_convolver_2 = - TwoStageFFTConvolver::init(&response, block_size, fir_size, 32, 1024); + TwoStageFFTConvolver::with_sizes(&response, block_size, fir_size, 32, 1024); let mut input = vec![0.0; input_size]; input[0] = 1.0; let mut output_a = vec![0.0; input_size]; From 0720c34af374e4bed0db30784a99d33825ccab0b Mon Sep 17 00:00:00 2001 From: Dan Grahelj Date: Tue, 27 May 2025 17:05:34 +0200 Subject: [PATCH 11/11] Pin ipp-sys commit SHA --- Cargo.lock | 16 ++++++++-------- Cargo.toml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ffe61c..74a16b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,12 +21,12 @@ dependencies = [ [[package]] name = "ipp-headers-sys" version = "0.4.5" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" [[package]] name = "ipp-sys" version = "0.4.5" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-headers-sys", "link-ippcore", @@ -39,12 +39,12 @@ dependencies = [ [[package]] name = "ipp-sys-build-help" version = "0.1.2" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" [[package]] name = "link-ippcore" version = "0.1.1" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", ] @@ -52,7 +52,7 @@ dependencies = [ [[package]] name = "link-ippcv" version = "0.1.2" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", "link-ippcore", @@ -62,7 +62,7 @@ dependencies = [ [[package]] name = "link-ippi" version = "0.1.2" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", "link-ippcore", @@ -72,7 +72,7 @@ dependencies = [ [[package]] name = "link-ipps" version = "0.1.2" -source = 
"git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", "link-ippcore", @@ -82,7 +82,7 @@ dependencies = [ [[package]] name = "link-ippvm" version = "0.1.0" -source = "git+ssh://git@github.com/holoplot/ipp-sys.git?branch=dan%2Fgenerate-2022-bindings#3e69ca21c1e24b141ab225a743910d999e591a4c" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" dependencies = [ "ipp-sys-build-help", "link-ippcore", diff --git a/Cargo.toml b/Cargo.toml index 33c4d45..ba0578c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" [dependencies] realfft = "3.3.0" rustfft = "6.1.0" -ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", branch = "dan/generate-2022-bindings", features = ["2022"], optional = true } +ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", rev = "3e69ca21c1e24b141ab225a743910d999e591a4c", features = ["2022"], optional = true } num-complex = "0.4.6" [features]