From f34eb3251b478fc8cbbe0c501f9641675896b65e Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 11 Apr 2024 22:37:05 -0400
Subject: [PATCH] Add automatic CPU feature detection

---
 .cargo/config.toml   |  2 --
 Cargo.toml           |  1 +
 README.md            |  5 +----
 benches/Makefile     |  2 +-
 benches/README.md    | 18 +++++++++---------
 benches/benchmark.sh |  2 --
 profile.sh           |  2 +-
 rust-toolchain.toml  |  2 ++
 src/cobra.rs         |  9 +++++++++
 src/kernels.rs       |  4 ++++
 src/lib.rs           | 11 ++++++++++-
 src/twiddles.rs      |  1 +
 12 files changed, 39 insertions(+), 20 deletions(-)
 delete mode 100644 .cargo/config.toml
 create mode 100644 rust-toolchain.toml

diff --git a/.cargo/config.toml b/.cargo/config.toml
deleted file mode 100644
index 745f656..0000000
--- a/.cargo/config.toml
+++ /dev/null
@@ -1,2 +0,0 @@
-[build]
-rustflags = ["-C", "target-cpu=native"] # custom flags to pass to all compiler invocations
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index dd141f2..38cf255 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ exclude = ["assets", "scripts", "benches"]
 
 [dependencies]
 num-traits = "0.2.18"
+multiversion = "0.7"
 
 [dev-dependencies]
 utilities = { path = "utilities" }
diff --git a/README.md b/README.md
index c3e3996..e885f3b 100644
--- a/README.md
+++ b/README.md
@@ -23,14 +23,11 @@ Transform (FFT) library written in pure Rust.
 
 - Only supports input with a length of `2^n` (i.e., a power of 2) -- input should be padded with zeros to the next power
   of 2
-- No runtime CPU feature detection (yet). Right now achieving the highest performance requires compiling
-  with `-C target-cpu=native` or [`cargo multivers`](https://github.com/ronnychevalier/cargo-multivers)
 - Requires nightly Rust compiler due to use of portable SIMD
 
 ## Planned features
 
 - Bluestein's algorithm (to handle arbitrary sized FFTs)
-- Runtime CPU feature detection
 - More multi-threading
 - More work on cache-optimal FFT
 
@@ -84,7 +81,7 @@ Then you can install PhastFT itself:
 
 ```bash
 pip install numpy
-RUSTFLAGS='-Ctarget-cpu=native' pip install git+https://github.com/QuState/PhastFT#subdirectory=pyphastft
+pip install git+https://github.com/QuState/PhastFT#subdirectory=pyphastft
 ```
 
 ```python
diff --git a/benches/Makefile b/benches/Makefile
index 0659f53..b3891b6 100644
--- a/benches/Makefile
+++ b/benches/Makefile
@@ -1,5 +1,5 @@
 CC = gcc
-CFLAGS = -Wall -Wextra -Werror -O3 -march=native
+CFLAGS = -Wall -Wextra -Werror -O3
 LIBS = -lfftw3 -lm
 
 bench_fftw: main.c
diff --git a/benches/README.md b/benches/README.md
index 80e3f7d..2d09118 100644
--- a/benches/README.md
+++ b/benches/README.md
@@ -18,7 +18,7 @@
 3. Create virtual env
 
 ```bash
-cd ~/PhastFT/benches && python -m venv .env && source .env/bin/activate
+cd ~/PhastFT/benches && python3 -m venv .env && source .env/bin/activate
 ```
 
 4. Install python dependencies[^1]
@@ -26,7 +26,7 @@ cd ~/PhastFT/benches && python -m venv .env && source .env/bin/activate
 ```bash
 pip install -r requirements.txt
 cd ~/PhastFT/pyphastft
-RUSTFLAGS='-Ctarget-cpu=native' pip install .
+pip install .
 ```
 
 5. Run the `FFTW3` vs. `RustFFT` vs. `PhastFT` benchmark for all inputs of size `2^n`, where `n \in [4, 30].`
@@ -55,13 +55,13 @@ The generated images will be saved in your working directory.
 
 ### Libraries and Packages
 
-| Library/Package | Version        | Language  | Benchmark Compilation Flags                                                                          |
-|-----------------|----------------|-----------|------------------------------------------------------------------------------------------------------|
-| `FFTW3`         | 3.3.10-1 amd64 | C, OCaml  | `-O3 -march=native`                                                                                  |
-| `RustFFT`       | 6.2.0          | Rust      | `-C target-cpu=native -C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
-| `PhastFT`       | 0.1.0          | Rust      | `-C target-cpu=native -C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
-| `NumPy`         | 1.26.4         | Python, C | `N/A`                                                                                                |
-| `pyFFTW`        | 0.13.1         | Python, C | `N/A`                                                                                                |
+| Library/Package | Version        | Language  | Benchmark Compilation Flags                                                     |
+|-----------------|----------------|-----------|---------------------------------------------------------------------------------|
+| `FFTW3`         | 3.3.10-1 amd64 | C, OCaml  | `-O3`                                                                           |
+| `RustFFT`       | 6.2.0          | Rust      | `-C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
+| `PhastFT`       | 0.1.0          | Rust      | `-C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
+| `NumPy`         | 1.26.4         | Python, C | `N/A`                                                                           |
+| `pyFFTW`        | 0.13.1         | Python, C | `N/A`                                                                           |
 
 ### Benchmark System Configuration
 
diff --git a/benches/benchmark.sh b/benches/benchmark.sh
index 05e45fa..98132ef 100755
--- a/benches/benchmark.sh
+++ b/benches/benchmark.sh
@@ -15,8 +15,6 @@ max_iters=1000  # Set your desired maximum number of iterations
 OUTPUT_DIR=benchmark-data.$(date +"%Y.%m.%d.%H-%M-%S")
 mkdir -p "$OUTPUT_DIR"/fftw3 && mkdir "$OUTPUT_DIR"/rustfft && mkdir "$OUTPUT_DIR"/phastft && mkdir "$OUTPUT_DIR"/fftwrb
 
-RUSTFLAGS="-C target-cpu=native"
-
 benchmark_fftw3() {
     make clean && make
 
diff --git a/profile.sh b/profile.sh
index 7898688..158e560 100755
--- a/profile.sh
+++ b/profile.sh
@@ -8,7 +8,7 @@ then
     exit 1
 fi
 
-RUSTFLAGS='-Ctarget-cpu=native' cargo +nightly build --profile profiling --example profile
+cargo +nightly build --profile profiling --example profile
 
 sudo perf record --call-graph=dwarf ./target/profiling/examples/profile $1 && sudo perf script -f -F +pid > processed_result.perf
 
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
new file mode 100644
index 0000000..5d56faf
--- /dev/null
+++ b/rust-toolchain.toml
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "nightly"
diff --git a/src/cobra.rs b/src/cobra.rs
index 56a42e3..24713d3 100644
--- a/src/cobra.rs
+++ b/src/cobra.rs
@@ -24,6 +24,7 @@ const LOG_BLOCK_WIDTH: usize = 7; // log2 of cacheline
 ///
 /// ## References
 /// [1] <https://www.katjaas.nl/bitreversal/bitreversal.html>
+#[inline]
 pub(crate) fn bit_rev<T>(buf: &mut [T], log_n: usize) {
     let mut nodd: usize;
     let mut noddrev; // to hold bitwise negated or odd values
@@ -164,6 +165,14 @@ pub(crate) fn bit_reverse_permutation<T>(buf: &mut [T]) {
 /// [2] Christian Knauth, Boran Adas, Daniel Whitfield, Xuesong Wang, Lydia Ickler, Tim Conrad, Oliver Serang:
 /// Practically efficient methods for performing bit-reversed permutation in C++11 on the x86-64 architecture
 /// [3] <https://bitbucket.org/orserang/bit-reversal-methods/src/master/src_and_bin/src/algorithms/COBRAShuffle.hpp>
+#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
+                                     "x86_64+avx2+fma", // x86_64-v3
+                                     "x86_64+sse4.2", // x86_64-v2
+                                     "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
+                                     "x86+avx2+fma",
+                                     "x86+sse4.2",
+                                     "x86+sse2",
+))]
 pub fn cobra_apply<T: Default + Copy + Clone>(v: &mut [T], log_n: usize) {
     if log_n <= 2 * LOG_BLOCK_WIDTH {
         bit_rev(v, log_n);
diff --git a/src/kernels.rs b/src/kernels.rs
index a02f77a..101b2ec 100644
--- a/src/kernels.rs
+++ b/src/kernels.rs
@@ -4,6 +4,7 @@ use num_traits::Float;
 
 macro_rules! fft_butterfly_n_simd {
     ($func_name:ident, $precision:ty, $lanes:literal, $simd_vector:ty) => {
+        #[inline]
         pub fn $func_name(
             reals: &mut [$precision],
             imags: &mut [$precision],
@@ -51,6 +52,7 @@ macro_rules! fft_butterfly_n_simd {
 fft_butterfly_n_simd!(fft_64_chunk_n_simd, f64, 8, f64x8);
 fft_butterfly_n_simd!(fft_32_chunk_n_simd, f32, 16, f32x16);
 
+#[inline]
 pub(crate) fn fft_chunk_n<T: Float>(
     reals: &mut [T],
     imags: &mut [T],
@@ -91,6 +93,7 @@ pub(crate) fn fft_chunk_n<T: Float>(
 }
 
 /// `chunk_size == 4`, so hard code twiddle factors
+#[inline]
 pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
     let dist = 2;
     let chunk_size = dist << 1;
@@ -125,6 +128,7 @@ pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
 }
 
 /// `chunk_size == 2`, so skip phase
+#[inline]
 pub(crate) fn fft_chunk_2<T: Float>(reals: &mut [T], imags: &mut [T]) {
     reals
         .chunks_exact_mut(2)
diff --git a/src/lib.rs b/src/lib.rs
index 257a9f0..c65a74f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -6,7 +6,7 @@
 #![warn(clippy::suspicious)]
 #![warn(clippy::perf)]
 #![forbid(unsafe_code)]
-#![feature(portable_simd)]
+#![feature(portable_simd, avx512_target_feature)]
 
 use crate::cobra::cobra_apply;
 use crate::kernels::{
@@ -77,6 +77,15 @@ macro_rules! impl_fft_with_opts_and_plan_for {
         /// # Panics
         ///
         /// Panics if `reals.len() != imags.len()`, or if the input length is _not_ a power of 2.
+        #[multiversion::multiversion(
+                                    targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
+                                            "x86_64+avx2+fma", // x86_64-v3
+                                            "x86_64+sse4.2", // x86_64-v2
+                                            "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
+                                            "x86+avx2+fma",
+                                            "x86+sse4.2",
+                                            "x86+sse2",
+        ))]
         pub fn $func_name(
             reals: &mut [$precision],
             imags: &mut [$precision],
diff --git a/src/twiddles.rs b/src/twiddles.rs
index 97f31a0..b754a11 100644
--- a/src/twiddles.rs
+++ b/src/twiddles.rs
@@ -181,6 +181,7 @@ macro_rules! generate_twiddles_simd {
 generate_twiddles_simd!(generate_twiddles_simd_64, f64, 8, f64x8);
 generate_twiddles_simd!(generate_twiddles_simd_32, f32, 8, f32x8);
 
+#[inline]
 pub(crate) fn filter_twiddles<T: Float>(twiddles_re: &mut Vec<T>, twiddles_im: &mut Vec<T>) {
     assert_eq!(twiddles_re.len(), twiddles_im.len());
     let dist = twiddles_re.len();