QuState · smu160 · Apr 18, 2024 · Apr 12, 2024 · Shnatsel · Apr 12, 2024
diff --git a/.cargo/config.toml b/.cargo/config.toml
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,6 +12,7 @@ exclude = ["assets", "scripts", "benches"]
 
 [dependencies]
 num-traits = "0.2.18"
+multiversion = "0.7"
 
 [dev-dependencies]
 utilities = { path = "utilities" }

diff --git a/README.md b/README.md
@@ -23,14 +23,11 @@ Transform (FFT) library written in pure Rust.
 
 - Only supports input with a length of `2^n` (i.e., a power of 2) -- input should be padded with zeros to the next power
   of 2
-- No runtime CPU feature detection (yet). Right now achieving the highest performance requires compiling
-  with `-C target-cpu=native` or [`cargo multivers`](https://github.com/ronnychevalier/cargo-multivers)
 - Requires nightly Rust compiler due to use of portable SIMD
 
 ## Planned features
 
 - Bluestein's algorithm (to handle arbitrary sized FFTs)
-- Runtime CPU feature detection
 - More multi-threading
 - More work on cache-optimal FFT
 
@@ -84,7 +81,7 @@ Then you can install PhastFT itself:
 
 ```bash
 pip install numpy
-RUSTFLAGS='-Ctarget-cpu=native' pip install git+https://github.com/QuState/PhastFT#subdirectory=pyphastft
+pip install git+https://github.com/QuState/PhastFT#subdirectory=pyphastft
 ```
 
 ```python

diff --git a/benches/Makefile b/benches/Makefile
@@ -1,5 +1,5 @@
 CC = gcc
-CFLAGS = -Wall -Wextra -Werror -O3 -march=native
+CFLAGS = -Wall -Wextra -Werror -O3
 LIBS = -lfftw3 -lm
 
 bench_fftw: main.c

diff --git a/benches/README.md b/benches/README.md
@@ -18,15 +18,15 @@
 3. Create virtual env
 
 ```bash
-cd ~/PhastFT/benches && python -m venv .env && source .env/bin/activate
+cd ~/PhastFT/benches && python3 -m venv .env && source .env/bin/activate
 ```
 
 4. Install Python dependencies[^1]
 
 ```bash
 pip install -r requirements.txt
 cd ~/PhastFT/pyphastft
-RUSTFLAGS='-Ctarget-cpu=native' pip install .
+pip install .
 ```
 
 5. Run the `FFTW3` vs. `RustFFT` vs. `PhastFT` benchmark for all inputs of size `2^n`, where `n \in [4, 30]`.
@@ -55,13 +55,13 @@ The generated images will be saved in your working directory.
 
 ### Libraries and Packages
 
-| Library/Package | Version        | Language  | Benchmark Compilation Flags                                                                          |
-|-----------------|----------------|-----------|------------------------------------------------------------------------------------------------------|
-| `FFTW3`         | 3.3.10-1 amd64 | C, OCaml  | `-O3 -march=native`                                                                                  |
-| `RustFFT`       | 6.2.0          | Rust      | `-C target-cpu=native -C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
-| `PhastFT`       | 0.1.0          | Rust      | `-C target-cpu=native -C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
-| `NumPy`         | 1.26.4         | Python, C | `N/A`                                                                                                |
-| `pyFFTW`        | 0.13.1         | Python, C | `N/A`                                                                                                |
+| Library/Package | Version        | Language  | Benchmark Compilation Flags                                                     |
+|-----------------|----------------|-----------|---------------------------------------------------------------------------------|
+| `FFTW3`         | 3.3.10-1 amd64 | C, OCaml  | `-O3`                                                                           |
+| `RustFFT`       | 6.2.0          | Rust      | `-C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
+| `PhastFT`       | 0.1.0          | Rust      | `-C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
+| `NumPy`         | 1.26.4         | Python, C | `N/A`                                                                           |
+| `pyFFTW`        | 0.13.1         | Python, C | `N/A`                                                                           |
 
 ### Benchmark System Configuration
 

diff --git a/benches/benchmark.sh b/benches/benchmark.sh
@@ -15,8 +15,6 @@ max_iters=1000  # Set your desired maximum number of iterations
 OUTPUT_DIR=benchmark-data.$(date +"%Y.%m.%d.%H-%M-%S")
 mkdir -p "$OUTPUT_DIR"/fftw3 && mkdir "$OUTPUT_DIR"/rustfft && mkdir "$OUTPUT_DIR"/phastft && mkdir "$OUTPUT_DIR"/fftwrb
 
-RUSTFLAGS="-C target-cpu=native"
-
 benchmark_fftw3() {
     make clean && make
 

diff --git a/profile.sh b/profile.sh
@@ -8,7 +8,7 @@ then
     exit 1
 fi
 
-RUSTFLAGS='-Ctarget-cpu=native' cargo +nightly build --profile profiling --example profile
+cargo +nightly build --profile profiling --example profile
 
 sudo perf record --call-graph=dwarf ./target/profiling/examples/profile $1 && sudo perf script -f -F +pid > processed_result.perf
 

diff --git a/rust-toolchain.toml b/rust-toolchain.toml
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "nightly"
diff --git a/src/cobra.rs b/src/cobra.rs
@@ -24,6 +24,7 @@ const LOG_BLOCK_WIDTH: usize = 7; // log2 of cacheline
 ///
 /// ## References
 /// [1] <https://www.katjaas.nl/bitreversal/bitreversal.html>
+#[inline]
 pub(crate) fn bit_rev<T>(buf: &mut [T], log_n: usize) {
     let mut nodd: usize;
     let mut noddrev; // to hold bitwise negated or odd values
@@ -164,6 +165,14 @@ pub(crate) fn bit_reverse_permutation<T>(buf: &mut [T]) {
 /// [2] Christian Knauth, Boran Adas, Daniel Whitfield, Xuesong Wang, Lydia Ickler, Tim Conrad, Oliver Serang:
 /// Practically efficient methods for performing bit-reversed permutation in C++11 on the x86-64 architecture
 /// [3] <https://bitbucket.org/orserang/bit-reversal-methods/src/master/src_and_bin/src/algorithms/COBRAShuffle.hpp>
+#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
+                                     "x86_64+avx2+fma", // x86_64-v3
+                                     "x86_64+sse4.2", // x86_64-v2
+                                     "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
+                                     "x86+avx2+fma",
+                                     "x86+sse4.2",
+                                     "x86+sse2",
+))]
 pub fn cobra_apply<T: Default + Copy + Clone>(v: &mut [T], log_n: usize) {
     if log_n <= 2 * LOG_BLOCK_WIDTH {
         bit_rev(v, log_n);

diff --git a/src/kernels.rs b/src/kernels.rs
@@ -4,6 +4,7 @@ use num_traits::Float;
 
 macro_rules! fft_butterfly_n_simd {
     ($func_name:ident, $precision:ty, $lanes:literal, $simd_vector:ty) => {
+        #[inline]
         pub fn $func_name(
             reals: &mut [$precision],
             imags: &mut [$precision],
@@ -51,6 +52,7 @@ macro_rules! fft_butterfly_n_simd {
 fft_butterfly_n_simd!(fft_64_chunk_n_simd, f64, 8, f64x8);
 fft_butterfly_n_simd!(fft_32_chunk_n_simd, f32, 16, f32x16);
 
+#[inline]
 pub(crate) fn fft_chunk_n<T: Float>(
     reals: &mut [T],
     imags: &mut [T],
@@ -91,6 +93,7 @@ pub(crate) fn fft_chunk_n<T: Float>(
 }
 
 /// `chunk_size == 4`, so the twiddle factors are hard-coded
+#[inline]
 pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
     let dist = 2;
     let chunk_size = dist << 1;
@@ -125,6 +128,7 @@ pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
 }
 
 /// `chunk_size == 2`, so skip phase
+#[inline]
 pub(crate) fn fft_chunk_2<T: Float>(reals: &mut [T], imags: &mut [T]) {
     reals
         .chunks_exact_mut(2)

diff --git a/src/lib.rs b/src/lib.rs
@@ -6,7 +6,7 @@
 #![warn(clippy::suspicious)]
 #![warn(clippy::perf)]
 #![forbid(unsafe_code)]
-#![feature(portable_simd)]
+#![feature(portable_simd, avx512_target_feature)]
 
 use crate::cobra::cobra_apply;
 use crate::kernels::{
@@ -77,6 +77,15 @@ macro_rules! impl_fft_with_opts_and_plan_for {
         /// # Panics
         ///
         /// Panics if `reals.len() != imags.len()`, or if the input length is _not_ a power of 2.
+        #[multiversion::multiversion(
+                                    targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
+                                            "x86_64+avx2+fma", // x86_64-v3
+                                            "x86_64+sse4.2", // x86_64-v2
+                                            "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
+                                            "x86+avx2+fma",
+                                            "x86+sse4.2",
+                                            "x86+sse2",
+        ))]
         pub fn $func_name(
             reals: &mut [$precision],
             imags: &mut [$precision],

diff --git a/src/twiddles.rs b/src/twiddles.rs
@@ -181,6 +181,7 @@ macro_rules! generate_twiddles_simd {
 generate_twiddles_simd!(generate_twiddles_simd_64, f64, 8, f64x8);
 generate_twiddles_simd!(generate_twiddles_simd_32, f32, 8, f32x8);
 
+#[inline]
 pub(crate) fn filter_twiddles<T: Float>(twiddles_re: &mut Vec<T>, twiddles_im: &mut Vec<T>) {
     assert_eq!(twiddles_re.len(), twiddles_im.len());
     let dist = twiddles_re.len();
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,7 +8,7 @@ then @@
         exit 1
     fi
-    RUSTFLAGS='-Ctarget-cpu=native' cargo +nightly build --profile profiling --example profile
+    cargo +nightly build --profile profiling --example profile
     sudo perf record --call-graph=dwarf ./target/profiling/examples/profile $1 && sudo perf script -f -F +pid > processed_result.perf
@@ Expand Down @@