From 9db573977579aa50e14352cd90cbe30c538a360b Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Mon, 1 Jul 2024 12:34:12 -0400 Subject: [PATCH 1/6] Make planner reusable - Planner should be re-usable so it can be re-used for FFT's of the same size - Add regression tests to make sure `fft_64`/`fft_32` gives the same results as `fft_64_with_opts_and_plan`/`fft_32_with_opts_and_plan` --- Cargo.toml | 8 ++- benches/bench.rs | 119 +++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 119 ++++++++++++++++++++++++++++++++----------- src/planner.rs | 11 +++- src/twiddles.rs | 28 +++++----- utilities/src/lib.rs | 30 +++++++++++ 6 files changed, 272 insertions(+), 43 deletions(-) create mode 100644 benches/bench.rs diff --git a/Cargo.toml b/Cargo.toml index 12d50ef..5aa2807 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,8 +15,14 @@ num-traits = "0.2.18" multiversion = "0.7" [dev-dependencies] -utilities = { path = "utilities" } +criterion = "0.5.1" fftw = "0.8.0" +rand = "0.8.5" +utilities = { path = "utilities" } + +[[bench]] +name = "bench" +harness = false [profile.release] codegen-units = 1 diff --git a/benches/bench.rs b/benches/bench.rs new file mode 100644 index 0000000..8187e47 --- /dev/null +++ b/benches/bench.rs @@ -0,0 +1,119 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use num_traits::Float; +use phastft::{ + fft_32_with_opts_and_plan, fft_64_with_opts_and_plan, + options::Options, + planner::{Direction, Planner32, Planner64}, +}; +use rand::{ + distributions::{Distribution, Standard}, + thread_rng, Rng, +}; + +const LENGTHS: &[usize] = &[ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, +]; + +fn generate_numbers(n: usize) -> (Vec, Vec) +where + Standard: Distribution, +{ + let mut rng = thread_rng(); + + let samples: Vec = (&mut rng).sample_iter(Standard).take(2 * n).collect(); + + let mut reals = vec![T::zero(); n]; + let mut imags = vec![T::zero(); n]; + + for ((z_re, z_im), rand_chunk) in reals + .iter_mut() + .zip(imags.iter_mut()) + .zip(samples.chunks_exact(2)) + { + *z_re = rand_chunk[0]; + *z_im = rand_chunk[1]; + } + + (reals, imags) +} + +fn benchmark_forward_f32(c: &mut Criterion) { + let options = Options::default(); + + for n in LENGTHS.iter() { + let len = 1 << n; + let id = format!("FFT Forward f32 {} elements", len); + c.bench_function(&id, |b| { + let (mut reals, mut imags) = generate_numbers(len); + let planner = Planner32::new(len, Direction::Forward); + b.iter(|| { + black_box(fft_32_with_opts_and_plan( + &mut reals, &mut imags, &options, &planner, + )); + }); + }); + } +} + +fn benchmark_inverse_f32(c: &mut Criterion) { + let options = Options::default(); + + for n in LENGTHS.iter() { + let len = 1 << n; + let id = format!("FFT Inverse f32 {} elements", len); + c.bench_function(&id, |b| { + let (mut reals, mut imags) = generate_numbers(len); + let planner = Planner32::new(len, Direction::Reverse); + b.iter(|| { + black_box(fft_32_with_opts_and_plan( + &mut reals, &mut imags, &options, &planner, + )); + }); + }); + } +} + +fn benchmark_forward_f64(c: &mut Criterion) { + let options = Options::default(); + + for n in LENGTHS.iter() { + let len = 1 << n; + let id = format!("FFT Forward f64 {} elements", len); + c.bench_function(&id, |b| { + let (mut reals, mut imags) = generate_numbers(len); + let planner = Planner64::new(len, Direction::Forward); + b.iter(|| { + black_box(fft_64_with_opts_and_plan( + &mut reals, &mut imags, &options, &planner, + )); + }); + }); + } +} + +fn benchmark_inverse_f64(c: &mut Criterion) { + let options = Options::default(); + + for n in LENGTHS.iter() { + let len = 1 << n; + let id = format!("FFT Inverse f64 {} elements", len); + c.bench_function(&id, |b| { + let (mut reals, mut imags) = generate_numbers(len); + let planner = Planner64::new(len, Direction::Reverse); + b.iter(|| { + black_box(fft_64_with_opts_and_plan( + &mut reals, &mut imags, &options, &planner, + )); + }); + }); + } +} + +criterion_group!( + benches, + benchmark_forward_f32, + benchmark_inverse_f32, + benchmark_forward_f64, + benchmark_inverse_f64 +); +criterion_main!(benches); diff --git a/src/lib.rs b/src/lib.rs index d0d84ca..ad3ea9c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,7 +48,7 @@ macro_rules! impl_fft_for { imags.len() ); - let mut planner = <$planner>::new(reals.len(), Direction::Forward); + let planner = <$planner>::new(reals.len(), direction); assert!( planner.num_twiddles().is_power_of_two() && planner.num_twiddles() == reals.len() / 2 @@ -56,27 +56,7 @@ macro_rules! impl_fft_for { let opts = Options::guess_options(reals.len()); - match direction { - Direction::Reverse => { - for z_im in imags.iter_mut() { - *z_im = -*z_im; - } - } - _ => (), - } - - $opts_and_plan(reals, imags, &opts, &mut planner); - - match direction { - Direction::Reverse => { - let scaling_factor = (reals.len() as $precision).recip(); - for (z_re, z_im) in reals.iter_mut().zip(imags.iter_mut()) { - *z_re *= scaling_factor; - *z_im *= -scaling_factor; - } - } - _ => (), - } + $opts_and_plan(reals, imags, &opts, &planner); } }; } @@ -112,30 +92,39 @@ macro_rules! impl_fft_with_opts_and_plan_for { reals: &mut [$precision], imags: &mut [$precision], opts: &Options, - planner: &mut $planner, + planner: &$planner, ) { assert!(reals.len() == imags.len() && reals.len().is_power_of_two()); let n: usize = reals.len().ilog2() as usize; - let twiddles_re = &mut planner.twiddles_re; - let twiddles_im = &mut planner.twiddles_im; + let mut twiddles_re = planner.twiddles_re.clone(); + let mut twiddles_im = planner.twiddles_im.clone(); // We shouldn't be able to execute FFT if the # of twiddles isn't equal to the distance // between pairs assert!(twiddles_re.len() == reals.len() / 2 && twiddles_im.len() == imags.len() / 2); + match planner.direction { + Direction::Reverse => { + for z_im in imags.iter_mut() { + *z_im = -*z_im; + } + } + _ => (), + } + for t in (0..n).rev() { let dist = 1 << t; let chunk_size = dist << 1; if chunk_size > 4 { if t < n - 1 { - filter_twiddles(twiddles_re, twiddles_im); + (twiddles_re, twiddles_im) = filter_twiddles(&twiddles_re, &twiddles_im); } if chunk_size >= $lanes * 2 { - $simd_butterfly_kernel(reals, imags, twiddles_re, twiddles_im, dist); + $simd_butterfly_kernel(reals, imags, &twiddles_re, &twiddles_im, dist); } else { - fft_chunk_n(reals, imags, twiddles_re, twiddles_im, dist); + fft_chunk_n(reals, imags, &twiddles_re, &twiddles_im, dist); } } else if chunk_size == 2 { fft_chunk_2(reals, imags); @@ -153,6 +142,17 @@ macro_rules! impl_fft_with_opts_and_plan_for { cobra_apply(reals, n); cobra_apply(imags, n); } + + match planner.direction { + Direction::Reverse => { + let scaling_factor = (reals.len() as $precision).recip(); + for (z_re, z_im) in reals.iter_mut().zip(imags.iter_mut()) { + *z_re *= scaling_factor; + *z_im *= -scaling_factor; + } + } + _ => (), + } } }; } @@ -179,7 +179,7 @@ mod tests { use utilities::rustfft::num_complex::Complex; use utilities::rustfft::FftPlanner; - use utilities::{assert_float_closeness, gen_random_signal}; + use utilities::{assert_float_closeness, gen_random_signal, gen_random_signal_f32}; use super::*; @@ -308,4 +308,65 @@ mod tests { } } } + + #[test] + fn fft_64_with_opts_and_plan_vs_fft_64() { + let num_points = 4096; + + let mut reals = vec![0.0; num_points]; + let mut imags = vec![0.0; num_points]; + gen_random_signal(&mut reals, &mut imags); + + let mut re = reals.clone(); + let mut im = imags.clone(); + + let mut planner = Planner64::new(num_points, Direction::Forward); + let opts = Options::guess_options(reals.len()); + fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &mut planner); + + fft_64(&mut re, &mut im, Direction::Forward); + + reals + .iter() + .zip(imags.iter()) + .zip(re.iter()) + .zip(im.iter()) + .for_each(|(((r, i), z_re), z_im)| { + assert_float_closeness(*r, *z_re, 1e-6); + assert_float_closeness(*i, *z_im, 1e-6); + }); + } + + #[test] + fn fft_32_with_opts_and_plan_vs_fft_64() { + let dirs = [Direction::Forward, Direction::Reverse]; + + for direction in dirs { + for n in 4..14 { + let num_points = 1 << n; + let mut reals = vec![0.0; num_points]; + let mut imags = vec![0.0; num_points]; + gen_random_signal_f32(&mut reals, &mut imags); + + let mut re = reals.clone(); + let mut im = imags.clone(); + + let mut planner = Planner32::new(num_points, direction); + let opts = Options::guess_options(reals.len()); + fft_32_with_opts_and_plan(&mut reals, &mut imags, &opts, &mut planner); + + fft_32(&mut re, &mut im, direction); + + reals + .iter() + .zip(imags.iter()) + .zip(re.iter()) + .zip(im.iter()) + .for_each(|(((r, i), z_re), z_im)| { + assert_float_closeness(*r, *z_re, 1e-6); + assert_float_closeness(*i, *z_im, 1e-6); + }); + } + } + } } diff --git a/src/planner.rs b/src/planner.rs index b174567..69a9777 100644 --- a/src/planner.rs +++ b/src/planner.rs @@ -25,6 +25,8 @@ macro_rules! impl_planner_for { pub twiddles_re: Vec<$precision>, /// The imaginary components of the twiddle factors pub twiddles_im: Vec<$precision>, + /// The direction of the FFT associated with this `Planner` + pub direction: Direction, } impl $struct_name { /// Create a `Planner` for an FFT of size `num_points`. @@ -35,12 +37,18 @@ macro_rules! impl_planner_for { /// # Panics /// /// Panics if `num_points < 1` or if `num_points` is __not__ a power of 2. - pub fn new(num_points: usize, _direction: Direction) -> Self { + pub fn new(num_points: usize, direction: Direction) -> Self { assert!(num_points > 0 && num_points.is_power_of_two()); + let dir = match direction { + Direction::Forward => Direction::Forward, + Direction::Reverse => Direction::Reverse, + }; + if num_points <= 4 { return Self { twiddles_re: vec![], twiddles_im: vec![], + direction: dir, }; } @@ -57,6 +65,7 @@ macro_rules! impl_planner_for { Self { twiddles_re, twiddles_im, + direction: dir, } } diff --git a/src/twiddles.rs b/src/twiddles.rs index 13dedba..e827f85 100644 --- a/src/twiddles.rs +++ b/src/twiddles.rs @@ -4,6 +4,9 @@ use num_traits::{Float, FloatConst, One, Zero}; use crate::planner::Direction; +/// This isn't really used except for testing. +/// It may be better to use this in the case where the input size is very large, +/// as to free up the cache. pub(crate) struct Twiddles { st: T, ct: T, @@ -182,7 +185,7 @@ generate_twiddles_simd!(generate_twiddles_simd_64, f64, 8, f64x8); generate_twiddles_simd!(generate_twiddles_simd_32, f32, 8, f32x8); #[inline] -pub(crate) fn filter_twiddles(twiddles_re: &mut Vec, twiddles_im: &mut Vec) { +pub(crate) fn filter_twiddles(twiddles_re: &[T], twiddles_im: &[T]) -> (Vec, Vec) { assert_eq!(twiddles_re.len(), twiddles_im.len()); let dist = twiddles_re.len(); @@ -194,8 +197,7 @@ pub(crate) fn filter_twiddles(twiddles_re: &mut Vec, twiddles_im: & && filtered_twiddles_re.len() == dist / 2 ); - let _ = std::mem::replace(twiddles_re, filtered_twiddles_re); - let _ = std::mem::replace(twiddles_im, filtered_twiddles_im); + (filtered_twiddles_re, filtered_twiddles_im) } #[cfg(test)] @@ -275,28 +277,30 @@ mod tests { let mut twiddles_iter = Twiddles::new(dist); - let (mut twiddles_re, mut twiddles_im) = generate_twiddles(dist, Direction::Forward); + let (twiddles_re, twiddles_im) = generate_twiddles(dist, Direction::Forward); for i in 0..dist { - let (tw_re, tw_im) = twiddles_iter.next().unwrap(); - assert_float_closeness(twiddles_re[i], tw_re, 1e-6); - assert_float_closeness(twiddles_im[i], tw_im, 1e-6); + let (w_re, w_im) = twiddles_iter.next().unwrap(); + assert_float_closeness(twiddles_re[i], w_re, 1e-6); + assert_float_closeness(twiddles_im[i], w_im, 1e-6); } + let (mut tw_re, mut tw_im) = (twiddles_re.clone(), twiddles_im.clone()); + for t in (0..n - 1).rev() { let dist = 1 << t; let mut twiddles_iter = Twiddles::new(dist); // Don't re-compute all the twiddles. // Just filter them out by taking every other twiddle factor - filter_twiddles(&mut twiddles_re, &mut twiddles_im); + (tw_re, tw_im) = filter_twiddles(&tw_re, &tw_im); - assert!(twiddles_re.len() == dist && twiddles_im.len() == dist); + assert!(tw_re.len() == dist && tw_im.len() == dist); for i in 0..dist { - let (tw_re, tw_im) = twiddles_iter.next().unwrap(); - assert_float_closeness(twiddles_re[i], tw_re, 1e-6); - assert_float_closeness(twiddles_im[i], tw_im, 1e-6); + let (w_re, w_im) = twiddles_iter.next().unwrap(); + assert_float_closeness(tw_re[i], w_re, 1e-6); + assert_float_closeness(tw_im[i], w_im, 1e-6); } } } diff --git a/utilities/src/lib.rs b/utilities/src/lib.rs index f7eff6f..0b330db 100644 --- a/utilities/src/lib.rs +++ b/utilities/src/lib.rs @@ -22,6 +22,36 @@ pub fn assert_float_closeness(actual: T, expected: T, epsilo } } +pub fn gen_random_signal_f32(reals: &mut [f32], imags: &mut [f32]) { + assert!(reals.len() == imags.len() && !reals.is_empty()); + let mut rng = thread_rng(); + let between = Uniform::from(0.0..1.0); + let angle_dist = Uniform::from(0.0..2.0 * (PI as f32)); + let num_amps = reals.len(); + + let mut probs: Vec<_> = (0..num_amps).map(|_| between.sample(&mut rng)).collect(); + + let total: f32 = probs.iter().sum(); + let total_recip = total.recip(); + + probs.iter_mut().for_each(|p| *p *= total_recip); + + let angles = (0..num_amps).map(|_| angle_dist.sample(&mut rng)); + + probs + .iter() + .zip(angles) + .enumerate() + .for_each(|(i, (p, a))| { + let p_sqrt = p.sqrt(); + let (sin_a, cos_a) = a.sin_cos(); + let re = p_sqrt * cos_a; + let im = p_sqrt * sin_a; + reals[i] = re; + imags[i] = im; + }); +} + /// Generate a random, complex, signal in the provided buffers /// /// # Panics From 0301325709b347ab19bc80211d9ea33714922f9c Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Mon, 1 Jul 2024 13:43:44 -0400 Subject: [PATCH 2/6] Move planner completely outside of bench function --- benches/bench.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 8187e47..8ec905d 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -43,9 +43,10 @@ fn benchmark_forward_f32(c: &mut Criterion) { for n in LENGTHS.iter() { let len = 1 << n; let id = format!("FFT Forward f32 {} elements", len); + let planner = Planner32::new(len, Direction::Forward); + c.bench_function(&id, |b| { let (mut reals, mut imags) = generate_numbers(len); - let planner = Planner32::new(len, Direction::Forward); b.iter(|| { black_box(fft_32_with_opts_and_plan( &mut reals, &mut imags, &options, &planner, @@ -61,9 +62,10 @@ fn benchmark_inverse_f32(c: &mut Criterion) { for n in LENGTHS.iter() { let len = 1 << n; let id = format!("FFT Inverse f32 {} elements", len); + let planner = Planner32::new(len, Direction::Reverse); + c.bench_function(&id, |b| { let (mut reals, mut imags) = generate_numbers(len); - let planner = Planner32::new(len, Direction::Reverse); b.iter(|| { black_box(fft_32_with_opts_and_plan( &mut reals, &mut imags, &options, &planner, @@ -79,9 +81,10 @@ fn benchmark_forward_f64(c: &mut Criterion) { for n in LENGTHS.iter() { let len = 1 << n; let id = format!("FFT Forward f64 {} elements", len); + let planner = Planner64::new(len, Direction::Forward); + c.bench_function(&id, |b| { let (mut reals, mut imags) = generate_numbers(len); - let planner = Planner64::new(len, Direction::Forward); b.iter(|| { black_box(fft_64_with_opts_and_plan( &mut reals, &mut imags, &options, &planner, @@ -97,9 +100,10 @@ fn benchmark_inverse_f64(c: &mut Criterion) { for n in LENGTHS.iter() { let len = 1 << n; let id = format!("FFT Inverse f64 {} elements", len); + let planner = Planner64::new(len, Direction::Reverse); + c.bench_function(&id, |b| { let (mut reals, mut imags) = generate_numbers(len); - let planner = Planner64::new(len, Direction::Reverse); b.iter(|| { black_box(fft_64_with_opts_and_plan( &mut reals, &mut imags, &options, &planner, From db4c3d426866882471c9d0125b81873c0942a710 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Mon, 1 Jul 2024 14:55:13 -0400 Subject: [PATCH 3/6] Add criterion group for forward FFT f32 --- benches/bench.rs | 52 ++++++++++++++++++++++++++++++++++++++++-------- src/options.rs | 3 ++- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 8ec905d..540c3cc 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -1,4 +1,4 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use num_traits::Float; use phastft::{ fft_32_with_opts_and_plan, fft_64_with_opts_and_plan, @@ -9,6 +9,8 @@ use rand::{ distributions::{Distribution, Standard}, thread_rng, Rng, }; +use utilities::rustfft::num_complex::Complex; +use utilities::rustfft::{FftNum, FftPlanner}; const LENGTHS: &[usize] = &[ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, @@ -37,23 +39,57 @@ where (reals, imags) } +fn generate_complex_numbers(n: usize) -> Vec> +where + Standard: Distribution, +{ + let mut rng = thread_rng(); + + let samples: Vec = (&mut rng).sample_iter(Standard).take(2 * n).collect(); + + let mut signal = vec![Complex::default(); n]; + + for (z, rand_chunk) in signal.iter_mut().zip(samples.chunks_exact(2)) { + z.re = rand_chunk[0]; + z.im = rand_chunk[1]; + } + + signal +} + fn benchmark_forward_f32(c: &mut Criterion) { - let options = Options::default(); + let mut group = c.benchmark_group("Forward f32"); for n in LENGTHS.iter() { let len = 1 << n; - let id = format!("FFT Forward f32 {} elements", len); + group.throughput(Throughput::Elements(len as u64)); + + let id = "PhastFT FFT Forward"; + let options = Options::guess_options(len); let planner = Planner32::new(len, Direction::Forward); + let (mut reals, mut imags) = generate_numbers(len); - c.bench_function(&id, |b| { - let (mut reals, mut imags) = generate_numbers(len); + group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| { b.iter(|| { - black_box(fft_32_with_opts_and_plan( - &mut reals, &mut imags, &options, &planner, - )); + fft_32_with_opts_and_plan( + black_box(&mut reals), + black_box(&mut imags), + black_box(&options), + black_box(&planner), + ); }); }); + + let id = "RustFFT FFT Forward"; + let mut planner = FftPlanner::::new(); + let fft = planner.plan_fft_forward(len); + let mut signal = generate_complex_numbers(len); + + group.bench_with_input(BenchmarkId::new(id, len), &len, |b, &len| { + b.iter(|| fft.process(black_box(&mut signal))); + }); } + group.finish(); } fn benchmark_inverse_f32(c: &mut Criterion) { diff --git a/src/options.rs b/src/options.rs index 9903946..6559dd9 100644 --- a/src/options.rs +++ b/src/options.rs @@ -15,7 +15,8 @@ pub struct Options { } impl Options { - pub(crate) fn guess_options(input_size: usize) -> Options { + /// Attempt to guess the best settings to use for optimal FFT + pub fn guess_options(input_size: usize) -> Options { let mut options = Options::default(); let n: usize = input_size.ilog2() as usize; options.multithreaded_bit_reversal = n >= 22; From b882363fd88ae560dca8a1c9f5bd197724ba6ad6 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Tue, 2 Jul 2024 10:30:16 -0400 Subject: [PATCH 4/6] Avoid cloning twiddles --- src/lib.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ad3ea9c..144b378 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -97,8 +97,9 @@ macro_rules! impl_fft_with_opts_and_plan_for { assert!(reals.len() == imags.len() && reals.len().is_power_of_two()); let n: usize = reals.len().ilog2() as usize; - let mut twiddles_re = planner.twiddles_re.clone(); - let mut twiddles_im = planner.twiddles_im.clone(); + // Use references to avoid unnecessary clones + let twiddles_re = &planner.twiddles_re; + let twiddles_im = &planner.twiddles_im; // We shouldn't be able to execute FFT if the # of twiddles isn't equal to the distance // between pairs @@ -113,18 +114,21 @@ macro_rules! impl_fft_with_opts_and_plan_for { _ => (), } + let mut filtered_twiddles_re = twiddles_re.clone(); + let mut filtered_twiddles_im = twiddles_im.clone(); + for t in (0..n).rev() { let dist = 1 << t; let chunk_size = dist << 1; if chunk_size > 4 { if t < n - 1 { - (twiddles_re, twiddles_im) = filter_twiddles(&twiddles_re, &twiddles_im); + (filtered_twiddles_re, filtered_twiddles_im) = filter_twiddles(&filtered_twiddles_re, &filtered_twiddles_im); } if chunk_size >= $lanes * 2 { - $simd_butterfly_kernel(reals, imags, &twiddles_re, &twiddles_im, dist); + $simd_butterfly_kernel(reals, imags, &filtered_twiddles_re, &filtered_twiddles_im, dist); } else { - fft_chunk_n(reals, imags, &twiddles_re, &twiddles_im, dist); + fft_chunk_n(reals, imags, &filtered_twiddles_re, &filtered_twiddles_im, dist); } } else if chunk_size == 2 { fft_chunk_2(reals, imags); From 8c24e088e1f09b47ca36ad624e1535afd3a5f622 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Tue, 2 Jul 2024 11:21:00 -0400 Subject: [PATCH 5/6] Updates to benchmarks without planners --- examples/benchmark.rs | 10 +++++++--- examples/profile.rs | 27 ++++++++++++++++++--------- examples/rustfft.rs | 7 ++++--- src/kernels.rs | 32 ++++++++++++++++++++++++++++++++ src/twiddles.rs | 9 +++++++++ 5 files changed, 70 insertions(+), 15 deletions(-) diff --git a/examples/benchmark.rs b/examples/benchmark.rs index 46d940e..e6ce897 100644 --- a/examples/benchmark.rs +++ b/examples/benchmark.rs @@ -3,8 +3,9 @@ use std::str::FromStr; use utilities::gen_random_signal; -use phastft::fft_64; -use phastft::planner::Direction; +use phastft::fft_64_with_opts_and_plan; +use phastft::options::Options; +use phastft::planner::{Direction, Planner64}; fn benchmark_fft_64(n: usize) { let big_n = 1 << n; @@ -12,8 +13,11 @@ fn benchmark_fft_64(n: usize) { let mut imags = vec![0.0; big_n]; gen_random_signal(&mut reals, &mut imags); + let planner = Planner64::new(reals.len(), Direction::Forward); + let opts = Options::guess_options(reals.len()); + let now = std::time::Instant::now(); - fft_64(&mut reals, &mut imags, Direction::Forward); + fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner); let elapsed = now.elapsed().as_micros(); println!("{elapsed}"); } diff --git a/examples/profile.rs b/examples/profile.rs index 6725046..dd31597 100644 --- a/examples/profile.rs +++ b/examples/profile.rs @@ -1,14 +1,22 @@ use std::env; use std::str::FromStr; -use phastft::fft_64; -use phastft::planner::Direction; - -fn benchmark_fft(num_qubits: usize) { - let n = 1 << num_qubits; - let mut reals: Vec = (1..=n).map(|i| i as f64).collect(); - let mut imags: Vec = (1..=n).map(|i| i as f64).collect(); - fft_64(&mut reals, &mut imags, Direction::Forward); +use utilities::gen_random_signal; + +use phastft::fft_64_with_opts_and_plan; +use phastft::options::Options; +use phastft::planner::{Direction, Planner64}; + +fn benchmark_fft_64(n: usize) { + let big_n = 1 << n; + let mut reals = vec![0.0; big_n]; + let mut imags = vec![0.0; big_n]; + gen_random_signal(&mut reals, &mut imags); + + let planner = Planner64::new(reals.len(), Direction::Forward); + let opts = Options::guess_options(reals.len()); + + fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner); } fn main() { @@ -16,5 +24,6 @@ fn main() { assert_eq!(args.len(), 2, "Usage {} ", args[0]); let n = usize::from_str(&args[1]).unwrap(); - benchmark_fft(n); + + benchmark_fft_64(n); } diff --git a/examples/rustfft.rs b/examples/rustfft.rs index 4ed34d7..8e9ac51 100644 --- a/examples/rustfft.rs +++ b/examples/rustfft.rs @@ -9,8 +9,8 @@ use utilities::{ fn benchmark_rustfft(n: usize) { let big_n = 1 << n; - let mut reals = vec![0.0; big_n]; - let mut imags = vec![0.0; big_n]; + let mut reals = vec![0.0f64; big_n]; + let mut imags = vec![0.0f64; big_n]; gen_random_signal(&mut reals, &mut imags); let mut signal = vec![Complex64::default(); big_n]; @@ -23,9 +23,10 @@ fn benchmark_rustfft(n: usize) { z.im = im; }); - let now = std::time::Instant::now(); let mut planner = FftPlanner::new(); let fft = planner.plan_fft_forward(signal.len()); + + let now = std::time::Instant::now(); fft.process(&mut signal); let elapsed = now.elapsed().as_micros(); println!("{elapsed}"); diff --git a/src/kernels.rs b/src/kernels.rs index 101b2ec..8651ee9 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -4,6 +4,14 @@ use num_traits::Float; macro_rules! fft_butterfly_n_simd { ($func_name:ident, $precision:ty, $lanes:literal, $simd_vector:ty) => { + #[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", + ))] #[inline] pub fn $func_name( reals: &mut [$precision], @@ -52,6 +60,14 @@ macro_rules! fft_butterfly_n_simd { fft_butterfly_n_simd!(fft_64_chunk_n_simd, f64, 8, f64x8); fft_butterfly_n_simd!(fft_32_chunk_n_simd, f32, 16, f32x16); +#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", +))] #[inline] pub(crate) fn fft_chunk_n( reals: &mut [T], @@ -93,6 +109,14 @@ pub(crate) fn fft_chunk_n( } /// `chunk_size == 4`, so hard code twiddle factors +#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", +))] #[inline] pub(crate) fn fft_chunk_4(reals: &mut [T], imags: &mut [T]) { let dist = 2; @@ -128,6 +152,14 @@ pub(crate) fn fft_chunk_4(reals: &mut [T], imags: &mut [T]) { } /// `chunk_size == 2`, so skip phase +#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", +))] #[inline] pub(crate) fn fft_chunk_2(reals: &mut [T], imags: &mut [T]) { reals diff --git a/src/twiddles.rs b/src/twiddles.rs index e827f85..f8d0506 100644 --- a/src/twiddles.rs +++ b/src/twiddles.rs @@ -184,6 +184,15 @@ macro_rules! generate_twiddles_simd { generate_twiddles_simd!(generate_twiddles_simd_64, f64, 8, f64x8); generate_twiddles_simd!(generate_twiddles_simd_32, f32, 8, f32x8); +#[multiversion::multiversion( + targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", +))] #[inline] pub(crate) fn filter_twiddles(twiddles_re: &[T], twiddles_im: &[T]) -> (Vec, Vec) { assert_eq!(twiddles_re.len(), twiddles_im.len()); From b3568d3612098c2424499288710e14f2961fa793 Mon Sep 17 00:00:00 2001 From: Saveliy Yusufov Date: Sat, 13 Jul 2024 22:11:56 -0400 Subject: [PATCH 6/6] Updates --- benches/bench.rs | 2 +- src/lib.rs | 35 ++++++++++++++++++++++++++--------- src/twiddles.rs | 28 ++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 10 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 540c3cc..9bf511e 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -10,7 +10,7 @@ use rand::{ thread_rng, Rng, }; use utilities::rustfft::num_complex::Complex; -use utilities::rustfft::{FftNum, FftPlanner}; +use utilities::rustfft::FftPlanner; const LENGTHS: &[usize] = &[ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, diff --git a/src/lib.rs b/src/lib.rs index 144b378..2ece7ec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -114,27 +114,44 @@ macro_rules! impl_fft_with_opts_and_plan_for { _ => (), } - let mut filtered_twiddles_re = twiddles_re.clone(); - let mut filtered_twiddles_im = twiddles_im.clone(); + // 0th stage is special due to no need to filter twiddle factor + let dist = 1 << (n - 1); + let chunk_size = dist << 1; + + if chunk_size > 4 { + if chunk_size >= $lanes * 2 { + $simd_butterfly_kernel(reals, imags, twiddles_re, twiddles_im, dist); + } else { + fft_chunk_n(reals, imags, twiddles_re, twiddles_im, dist); + } + } + else if chunk_size == 4 { + fft_chunk_4(reals, imags); + } + else if chunk_size == 2 { + fft_chunk_2(reals, imags); + } + + let (mut filtered_twiddles_re, mut filtered_twiddles_im) = filter_twiddles(twiddles_re, twiddles_im); - for t in (0..n).rev() { + for t in (0..n - 1).rev() { let dist = 1 << t; let chunk_size = dist << 1; if chunk_size > 4 { - if t < n - 1 { - (filtered_twiddles_re, filtered_twiddles_im) = filter_twiddles(&filtered_twiddles_re, &filtered_twiddles_im); - } if chunk_size >= $lanes * 2 { $simd_butterfly_kernel(reals, imags, &filtered_twiddles_re, &filtered_twiddles_im, dist); } else { fft_chunk_n(reals, imags, &filtered_twiddles_re, &filtered_twiddles_im, dist); } - } else if chunk_size == 2 { - fft_chunk_2(reals, imags); - } else if chunk_size == 4 { + } + else if chunk_size == 4 { fft_chunk_4(reals, imags); } + else if chunk_size == 2 { + fft_chunk_2(reals, imags); + } + (filtered_twiddles_re, filtered_twiddles_im) = filter_twiddles(&filtered_twiddles_re, &filtered_twiddles_im); } if opts.multithreaded_bit_reversal { diff --git a/src/twiddles.rs b/src/twiddles.rs index f8d0506..7e11ba9 100644 --- a/src/twiddles.rs +++ b/src/twiddles.rs @@ -217,6 +217,34 @@ mod tests { use super::*; + // TODO(saveliy): use + #[test] + fn twiddles_cos_only() { + let n = 4; + let big_n = 1 << n; + + let dist = big_n >> 1; + + let (fwd_twiddles_re, fwd_twiddles_im) = if dist >= 8 * 2 { + generate_twiddles_simd_64(dist, Direction::Forward) + } else { + generate_twiddles(dist, Direction::Forward) + }; + + assert!(fwd_twiddles_re.len() == dist && fwd_twiddles_im.len() == dist); + + for i in 0..dist { + let w_re = fwd_twiddles_re[i]; + let expected_w_im = fwd_twiddles_im[i]; + + let actual_w_im = -fwd_twiddles_re[(i + dist / 2) % dist]; + //assert_float_closeness(actual_w_im, expected_w_im, 1e-6); + print!("actual: {actual_w_im} expected: {expected_w_im}\n"); + } + println!("{:?}", fwd_twiddles_re); + println!("{:?}", fwd_twiddles_im); + } + #[test] fn twiddles_4() { const N: usize = 4;