From 4b2858a007ac5bdce8fba7f734b78162e96df243 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 2 Feb 2024 22:06:53 +0000 Subject: [PATCH 1/6] Add Options struct --- src/lib.rs | 1 + src/options.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 src/options.rs diff --git a/src/lib.rs b/src/lib.rs index b069afb..803c860 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,6 +7,7 @@ use crate::twiddles::{filter_twiddles, generate_twiddles, generate_twiddles_simd mod cobra; mod kernels; mod twiddles; +pub mod options; /// FFT -- Decimation in Frequency /// diff --git a/src/options.rs b/src/options.rs new file mode 100644 index 0000000..0f21602 --- /dev/null +++ b/src/options.rs @@ -0,0 +1,43 @@ +/// Options to tune to improve performance depending on the hardware and input size. +/// +/// Calling FFT routines without specifying options will automatically select reasonable defaults +/// depending on the input size and other factors. +/// +/// You only need to tune these options if you are trying to squeeze maximum performance +/// out of a known hardware platform that you can benchmark at varying input sizes. +#[non_exhaustive] +#[derive(Debug, Clone, Default)] +pub struct Options { + pub bit_reverse: BitReverseAlgorithm, +} + +impl Options { + pub(crate) fn guess_options(input_size: usize) -> Options { + let mut options = Options::default(); + let n: usize = input_size.ilog2() as usize; + if n < 22 { + options.bit_reverse = BitReverseAlgorithm::Cobra; + } else { + options.bit_reverse = BitReverseAlgorithm::MultiThreadedCobra; + } + options + } +} + +/// The algorithm to use for bit reversal. +/// Different algorithms perform best on different input sizes. 
+#[derive(Debug, Copy, Clone, PartialEq, Default)] +pub enum BitReverseAlgorithm { + #[default] + /// Straightforward algorithm that performs best at smaller sizes + Plain, + /// Cache-Optimal Bit Reversal Algorithm + /// + /// This is faster at larger datasets that do not fit into the cache. + /// The exact threshold where it starts being beneficial varies depending on the hardware. + Cobra, + /// COBRA but run on two threads instead of one. + /// Typically beneficial at even larger sizes than single-threaded COBRA, and slower otherwise. + /// The exact threshold where it starts being beneficial varies depending on the hardware. + MultiThreadedCobra, +} \ No newline at end of file From 93b9d038031beba55df70fde906ae6b1c92157e6 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 2 Feb 2024 22:07:01 +0000 Subject: [PATCH 2/6] cargo fmt --- src/lib.rs | 2 +- src/options.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 803c860..e57088b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,8 +6,8 @@ use crate::twiddles::{filter_twiddles, generate_twiddles, generate_twiddles_simd mod cobra; mod kernels; -mod twiddles; pub mod options; +mod twiddles; /// FFT -- Decimation in Frequency /// diff --git a/src/options.rs b/src/options.rs index 0f21602..3f82a9b 100644 --- a/src/options.rs +++ b/src/options.rs @@ -1,8 +1,8 @@ /// Options to tune to improve performance depending on the hardware and input size. -/// +/// /// Calling FFT routines without specifying options will automatically select reasonable defaults /// depending on the input size and other factors. -/// +/// /// You only need to tune these options if you are trying to squeeze maximum performance /// out of a known hardware platform that you can benchmark at varying input sizes. 
#[non_exhaustive] @@ -32,7 +32,7 @@ pub enum BitReverseAlgorithm { /// Straightforward algorithm that performs best at smaller sizes Plain, /// Cache-Optimal Bit Reversal Algorithm - /// + /// /// This is faster at larger datasets that do not fit into the cache. /// The exact threshold where it starts being beneficial varies depending on the hardware. Cobra, @@ -40,4 +40,4 @@ pub enum BitReverseAlgorithm { /// Typically beneficial at even larger sizes than single-threaded COBRA, and slower otherwise. /// The exact threshold where it starts being beneficial varies depending on the hardware. MultiThreadedCobra, -} \ No newline at end of file +} From db724700dadf89075313d17f16e07ac6c392bf91 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 2 Feb 2024 22:23:06 +0000 Subject: [PATCH 3/6] Simplify options down to a single multi-threading knob --- src/options.rs | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/src/options.rs b/src/options.rs index 3f82a9b..46751d7 100644 --- a/src/options.rs +++ b/src/options.rs @@ -8,7 +8,10 @@ #[non_exhaustive] #[derive(Debug, Clone, Default)] pub struct Options { - pub bit_reverse: BitReverseAlgorithm, + /// Whether to run bit reversal step in 2 threads instead of one. + /// This is beneficial only at large input sizes (i.e. gigabytes of data). + /// The exact threshold where it starts being beneficial varies depending on the hardware. + pub multithreaded_bit_reversal: bool, } impl Options { @@ -16,28 +19,10 @@ impl Options { let mut options = Options::default(); let n: usize = input_size.ilog2() as usize; if n < 22 { - options.bit_reverse = BitReverseAlgorithm::Cobra; + options.multithreaded_bit_reversal = false; } else { - options.bit_reverse = BitReverseAlgorithm::MultiThreadedCobra; + options.multithreaded_bit_reversal = true; } options } } - -/// The algorithm to use for bit reversal. -/// Different algorithms perform best on different input sizes. 
-#[derive(Debug, Copy, Clone, PartialEq, Default)] -pub enum BitReverseAlgorithm { - #[default] - /// Straightforward algorithm that performs best at smaller sizes - Plain, - /// Cache-Optimal Bit Reversal Algorithm - /// - /// This is faster at larger datasets that do not fit into the cache. - /// The exact threshold where it starts being beneficial varies depending on the hardware. - Cobra, - /// COBRA but run on two threads instead of one. - /// Typically beneficial at even larger sizes than single-threaded COBRA, and slower otherwise. - /// The exact threshold where it starts being beneficial varies depending on the hardware. - MultiThreadedCobra, -} From 34f1968a7eaf82904ed71257b05d3d7b5073488b Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 2 Feb 2024 22:26:26 +0000 Subject: [PATCH 4/6] Wire up guessing the options to the main FFT function --- src/lib.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e57088b..e79210b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ use crate::cobra::cobra_apply; use crate::kernels::{fft_chunk_2, fft_chunk_4, fft_chunk_n, fft_chunk_n_simd, Float}; +use crate::options::Options; use crate::twiddles::{filter_twiddles, generate_twiddles, generate_twiddles_simd}; mod cobra; @@ -22,6 +23,7 @@ mod twiddles; pub fn fft_dif(reals: &mut [Float], imags: &mut [Float]) { assert_eq!(reals.len(), imags.len()); let n: usize = reals.len().ilog2() as usize; + let opts = Options::guess_options(reals.len()); let dist = 1 << (n - 1); let chunk_size = dist << 1; @@ -63,14 +65,14 @@ pub fn fft_dif(reals: &mut [Float], imags: &mut [Float]) { } } - if n < 22 { - cobra_apply(reals, n); - cobra_apply(imags, n); - } else { + if opts.multithreaded_bit_reversal { std::thread::scope(|s| { s.spawn(|| cobra_apply(reals, n)); s.spawn(|| cobra_apply(imags, n)); }); + } else { + cobra_apply(reals, n); + cobra_apply(imags, n); } } From 
ed4ebcad2d9058c6386a82d3984939e74e1a4b66 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 2 Feb 2024 22:30:23 +0000 Subject: [PATCH 5/6] Add a public function that accepts caller-provided `Options` --- src/lib.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index e79210b..3c1cdc9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,9 +21,17 @@ mod twiddles; /// /// [1] https://inst.eecs.berkeley.edu/~ee123/sp15/Notes/Lecture08_FFT_and_SpectAnalysis.key.pdf pub fn fft_dif(reals: &mut [Float], imags: &mut [Float]) { + let opts = Options::guess_options(reals.len()); + fft_dif_with_opts(reals, imags, &opts) +} + +/// Same as [fft_dif], but also accepts [`Options`] that control optimization strategies. +/// +/// `fft_dif` automatically guesses the best strategy for a given input, +/// so you only need to call this if you are tuning performance for a specific hardware platform. +pub fn fft_dif_with_opts(reals: &mut [Float], imags: &mut [Float], opts: &Options) { assert_eq!(reals.len(), imags.len()); let n: usize = reals.len().ilog2() as usize; - let opts = Options::guess_options(reals.len()); let dist = 1 << (n - 1); let chunk_size = dist << 1; From 6c6d16fb150e511cb764fe5a256b0e1cbfb14e33 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 2 Feb 2024 22:32:54 +0000 Subject: [PATCH 6/6] placate Clippy --- src/options.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/options.rs b/src/options.rs index 46751d7..07d1086 100644 --- a/src/options.rs +++ b/src/options.rs @@ -18,11 +18,7 @@ impl Options { pub(crate) fn guess_options(input_size: usize) -> Options { let mut options = Options::default(); let n: usize = input_size.ilog2() as usize; - if n < 22 { - options.multithreaded_bit_reversal = false; - } else { - options.multithreaded_bit_reversal = true; - } + options.multithreaded_bit_reversal = n >= 22; options } }