From f34eb3251b478fc8cbbe0c501f9641675896b65e Mon Sep 17 00:00:00 2001 From: Caleb Zulawski Date: Thu, 11 Apr 2024 22:37:05 -0400 Subject: [PATCH] Add automatic CPU feature detection --- .cargo/config.toml | 2 -- Cargo.toml | 1 + README.md | 5 +---- benches/Makefile | 2 +- benches/README.md | 18 +++++++++--------- benches/benchmark.sh | 2 -- profile.sh | 2 +- rust-toolchain.toml | 2 ++ src/cobra.rs | 9 +++++++++ src/kernels.rs | 4 ++++ src/lib.rs | 11 ++++++++++- src/twiddles.rs | 1 + 12 files changed, 39 insertions(+), 20 deletions(-) delete mode 100644 .cargo/config.toml create mode 100644 rust-toolchain.toml diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index 745f656..0000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -[build] -rustflags = ["-C", "target-cpu=native"] # custom flags to pass to all compiler invocations \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index dd141f2..38cf255 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ exclude = ["assets", "scripts", "benches"] [dependencies] num-traits = "0.2.18" +multiversion = "0.7" [dev-dependencies] utilities = { path = "utilities" } diff --git a/README.md b/README.md index c3e3996..e885f3b 100644 --- a/README.md +++ b/README.md @@ -23,14 +23,11 @@ Transform (FFT) library written in pure Rust. - Only supports input with a length of `2^n` (i.e., a power of 2) -- input should be padded with zeros to the next power of 2 -- No runtime CPU feature detection (yet). Right now achieving the highest performance requires compiling - with `-C target-cpu=native` or [`cargo multivers`](https://github.com/ronnychevalier/cargo-multivers) - Requires nightly Rust compiler due to use of portable SIMD ## Planned features - Bluestein's algorithm (to handle arbitrary sized FFTs) -- Runtime CPU feature detection - More multi-threading - More work on cache-optimal FFT @@ -84,7 +81,7 @@ Then you can install PhastFT itself: ```bash pip install numpy -RUSTFLAGS='-Ctarget-cpu=native' pip install git+https://github.com/QuState/PhastFT#subdirectory=pyphastft +pip install git+https://github.com/QuState/PhastFT#subdirectory=pyphastft ``` ```python diff --git a/benches/Makefile b/benches/Makefile index 0659f53..b3891b6 100644 --- a/benches/Makefile +++ b/benches/Makefile @@ -1,5 +1,5 @@ CC = gcc -CFLAGS = -Wall -Wextra -Werror -O3 -march=native +CFLAGS = -Wall -Wextra -Werror -O3 LIBS = -lfftw3 -lm bench_fftw: main.c diff --git a/benches/README.md b/benches/README.md index 80e3f7d..2d09118 100644 --- a/benches/README.md +++ b/benches/README.md @@ -18,7 +18,7 @@ 3. Create virtual env ```bash -cd ~/PhastFT/benches && python -m venv .env && source .env/bin/activate +cd ~/PhastFT/benches && python3 -m venv .env && source .env/bin/activate ``` 4. Install python dependencies[^1] @@ -26,7 +26,7 @@ cd ~/PhastFT/benches && python -m venv .env && source .env/bin/activate ```bash pip install -r requirements.txt cd ~/PhastFT/pyphastft -RUSTFLAGS='-Ctarget-cpu=native' pip install . +pip install . ``` 5. Run the `FFTW3` vs. `RustFFT` vs. `PhastFT` benchmark for all inputs of size `2^n`, where `n \in [4, 30].` @@ -55,13 +55,13 @@ The generated images will be saved in your working directory. ### Libraries and Packages -| Library/Package | Version | Language | Benchmark Compilation Flags | -|-----------------|----------------|-----------|------------------------------------------------------------------------------------------------------| -| `FFTW3` | 3.3.10-1 amd64 | C, OCaml | `-O3 -march=native` | -| `RustFFT` | 6.2.0 | Rust | `-C target-cpu=native -C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` | -| `PhastFT` | 0.1.0 | Rust | `-C target-cpu=native -C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` | -| `NumPy` | 1.26.4 | Python, C | `N/A` | -| `pyFFTW` | 0.13.1 | Python, C | `N/A` | +| Library/Package | Version | Language | Benchmark Compilation Flags | +|-----------------|----------------|-----------|---------------------------------------------------------------------------------| +| `FFTW3` | 3.3.10-1 amd64 | C, OCaml | `-O3` | +| `RustFFT` | 6.2.0 | Rust | `-C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` | +| `PhastFT` | 0.1.0 | Rust | `-C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` | +| `NumPy` | 1.26.4 | Python, C | `N/A` | +| `pyFFTW` | 0.13.1 | Python, C | `N/A` | ### Benchmark System Configuration diff --git a/benches/benchmark.sh b/benches/benchmark.sh index 05e45fa..98132ef 100755 --- a/benches/benchmark.sh +++ b/benches/benchmark.sh @@ -15,8 +15,6 @@ max_iters=1000 # Set your desired maximum number of iterations OUTPUT_DIR=benchmark-data.$(date +"%Y.%m.%d.%H-%M-%S") mkdir -p "$OUTPUT_DIR"/fftw3 && mkdir "$OUTPUT_DIR"/rustfft && mkdir "$OUTPUT_DIR"/phastft && mkdir "$OUTPUT_DIR"/fftwrb -RUSTFLAGS="-C target-cpu=native" - benchmark_fftw3() { make clean && make diff --git a/profile.sh b/profile.sh index 7898688..158e560 100755 --- a/profile.sh +++ b/profile.sh @@ -8,7 +8,7 @@ then exit 1 fi -RUSTFLAGS='-Ctarget-cpu=native' cargo +nightly build --profile profiling --example profile +cargo +nightly build --profile profiling --example profile sudo perf record --call-graph=dwarf ./target/profiling/examples/profile $1 && sudo perf script -f -F +pid > processed_result.perf diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..5d56faf --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "nightly" diff --git a/src/cobra.rs b/src/cobra.rs index 56a42e3..24713d3 100644 --- a/src/cobra.rs +++ b/src/cobra.rs @@ -24,6 +24,7 @@ const LOG_BLOCK_WIDTH: usize = 7; // log2 of cacheline /// /// ## References /// [1] +#[inline] pub(crate) fn bit_rev(buf: &mut [T], log_n: usize) { let mut nodd: usize; let mut noddrev; // to hold bitwise negated or odd values @@ -164,6 +165,14 @@ pub(crate) fn bit_reverse_permutation(buf: &mut [T]) { /// [2] Christian Knauth, Boran Adas, Daniel Whitfield, Xuesong Wang, Lydia Ickler, Tim Conrad, Oliver Serang: /// Practically efficient methods for performing bit-reversed permutation in C++11 on the x86-64 architecture /// [3] +#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", +))] pub fn cobra_apply(v: &mut [T], log_n: usize) { if log_n <= 2 * LOG_BLOCK_WIDTH { bit_rev(v, log_n); diff --git a/src/kernels.rs b/src/kernels.rs index a02f77a..101b2ec 100644 --- a/src/kernels.rs +++ b/src/kernels.rs @@ -4,6 +4,7 @@ use num_traits::Float; macro_rules! fft_butterfly_n_simd { ($func_name:ident, $precision:ty, $lanes:literal, $simd_vector:ty) => { + #[inline] pub fn $func_name( reals: &mut [$precision], imags: &mut [$precision], @@ -51,6 +52,7 @@ macro_rules! fft_butterfly_n_simd { fft_butterfly_n_simd!(fft_64_chunk_n_simd, f64, 8, f64x8); fft_butterfly_n_simd!(fft_32_chunk_n_simd, f32, 16, f32x16); +#[inline] pub(crate) fn fft_chunk_n( reals: &mut [T], imags: &mut [T], @@ -91,6 +93,7 @@ pub(crate) fn fft_chunk_n( } /// `chunk_size == 4`, so hard code twiddle factors +#[inline] pub(crate) fn fft_chunk_4(reals: &mut [T], imags: &mut [T]) { let dist = 2; let chunk_size = dist << 1; @@ -125,6 +128,7 @@ pub(crate) fn fft_chunk_4(reals: &mut [T], imags: &mut [T]) { } /// `chunk_size == 2`, so skip phase +#[inline] pub(crate) fn fft_chunk_2(reals: &mut [T], imags: &mut [T]) { reals .chunks_exact_mut(2) diff --git a/src/lib.rs b/src/lib.rs index 257a9f0..c65a74f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ #![warn(clippy::suspicious)] #![warn(clippy::perf)] #![forbid(unsafe_code)] -#![feature(portable_simd)] +#![feature(portable_simd, avx512_target_feature)] use crate::cobra::cobra_apply; use crate::kernels::{ @@ -77,6 +77,15 @@ macro_rules! impl_fft_with_opts_and_plan_for { /// # Panics /// /// Panics if `reals.len() != imags.len()`, or if the input length is _not_ a power of 2. + #[multiversion::multiversion( + targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4 + "x86_64+avx2+fma", // x86_64-v3 + "x86_64+sse4.2", // x86_64-v2 + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", + ))] pub fn $func_name( reals: &mut [$precision], imags: &mut [$precision], diff --git a/src/twiddles.rs b/src/twiddles.rs index 97f31a0..b754a11 100644 --- a/src/twiddles.rs +++ b/src/twiddles.rs @@ -181,6 +181,7 @@ macro_rules! generate_twiddles_simd { generate_twiddles_simd!(generate_twiddles_simd_64, f64, 8, f64x8); generate_twiddles_simd!(generate_twiddles_simd_32, f32, 8, f32x8); +#[inline] pub(crate) fn filter_twiddles(twiddles_re: &mut Vec, twiddles_im: &mut Vec) { assert_eq!(twiddles_re.len(), twiddles_im.len()); let dist = twiddles_re.len();