Skip to content

Commit

Permalink
Add automatic CPU feature detection
Browse files Browse the repository at this point in the history
  • Loading branch information
calebzulawski committed Apr 12, 2024
1 parent ad6fed5 commit 4985516
Show file tree
Hide file tree
Showing 12 changed files with 39 additions and 20 deletions.
2 changes: 0 additions & 2 deletions .cargo/config.toml

This file was deleted.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ exclude = ["assets", "scripts", "benches"]

[dependencies]
num-traits = "0.2.18"
multiversion = "0.7"

[dev-dependencies]
utilities = { path = "utilities" }
Expand Down
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,11 @@ Transform (FFT) library written in pure Rust.

- Only supports input with a length of `2^n` (i.e., a power of 2) -- input should be padded with zeros to the next power
of 2
- No runtime CPU feature detection (yet). Right now achieving the highest performance requires compiling
with `-C target-cpu=native` or [`cargo multivers`](https://github.com/ronnychevalier/cargo-multivers)
- Requires nightly Rust compiler due to use of portable SIMD

## Planned features

- Bluestein's algorithm (to handle arbitrary sized FFTs)
- Runtime CPU feature detection
- More multi-threading
- More work on cache-optimal FFT

Expand Down Expand Up @@ -86,7 +83,7 @@ Then you can install PhastFT itself:

```bash
pip install numpy
RUSTFLAGS='-Ctarget-cpu=native' pip install git+https://github.com/QuState/PhastFT#subdirectory=pyphastft
pip install git+https://github.com/QuState/PhastFT#subdirectory=pyphastft
```

```python
Expand Down
2 changes: 1 addition & 1 deletion benches/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CC = gcc
CFLAGS = -Wall -Wextra -Werror -O3 -march=native
CFLAGS = -Wall -Wextra -Werror -O3
LIBS = -lfftw3 -lm

bench_fftw: main.c
Expand Down
18 changes: 9 additions & 9 deletions benches/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@
3. Create virtual env

```bash
cd ~/PhastFT/benches && python -m venv .env && source .env/bin/activate
cd ~/PhastFT/benches && python3 -m venv .env && source .env/bin/activate
```

4. Install python dependencies[^1]

```bash
pip install -r requirements.txt
cd ~/PhastFT/pyphastft
RUSTFLAGS='-Ctarget-cpu=native' pip install .
pip install .
```

5. Run the `FFTW3` vs. `RustFFT` vs. `PhastFT` benchmark for all inputs of size `2^n`, where `n \in [4, 30]`.
Expand Down Expand Up @@ -55,13 +55,13 @@ The generated images will be saved in your working directory.

### Libraries and Packages

| Library/Package | Version | Language | Benchmark Compilation Flags |
|-----------------|----------------|-----------|------------------------------------------------------------------------------------------------------|
| `FFTW3` | 3.3.10-1 amd64 | C, OCaml | `-O3 -march=native` |
| `RustFFT` | 6.2.0 | Rust | `-C target-cpu=native -C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
| `PhastFT` | 0.1.0 | Rust | `-C target-cpu=native -C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
| `NumPy` | 1.26.4 | Python, C | `N/A` |
| `pyFFTW` | 0.13.1 | Python, C | `N/A` |
| Library/Package | Version | Language | Benchmark Compilation Flags |
|-----------------|----------------|-----------|---------------------------------------------------------------------------------|
| `FFTW3` | 3.3.10-1 amd64 | C, OCaml | `-O3` |
| `RustFFT` | 6.2.0 | Rust | `-C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
| `PhastFT` | 0.1.0 | Rust | `-C opt-level=3 --edition=2021; codegen-units = 1; lto = true; panic = "abort"` |
| `NumPy` | 1.26.4 | Python, C | `N/A` |
| `pyFFTW` | 0.13.1 | Python, C | `N/A` |

### Benchmark System Configuration

Expand Down
2 changes: 0 additions & 2 deletions benches/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ max_iters=1000 # Set your desired maximum number of iterations
OUTPUT_DIR=benchmark-data.$(date +"%Y.%m.%d.%H-%M-%S")
mkdir -p "$OUTPUT_DIR"/fftw3 && mkdir "$OUTPUT_DIR"/rustfft && mkdir "$OUTPUT_DIR"/phastft && mkdir "$OUTPUT_DIR"/fftwrb

RUSTFLAGS="-C target-cpu=native"

benchmark_fftw3() {
make clean && make

Expand Down
2 changes: 1 addition & 1 deletion profile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ then
exit 1
fi

RUSTFLAGS='-Ctarget-cpu=native' cargo +nightly build --profile profiling --example profile
cargo +nightly build --profile profiling --example profile

sudo perf record --call-graph=dwarf ./target/profiling/examples/profile $1 && sudo perf script -f -F +pid > processed_result.perf

Expand Down
2 changes: 2 additions & 0 deletions rust-toolchain.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[toolchain]
channel = "nightly"
9 changes: 9 additions & 0 deletions src/cobra.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ const LOG_BLOCK_WIDTH: usize = 7; // log2 of cacheline
///
/// ## References
/// [1] <https://www.katjaas.nl/bitreversal/bitreversal.html>
#[inline]
pub(crate) fn bit_rev<T>(buf: &mut [T], log_n: usize) {
let mut nodd: usize;
let mut noddrev; // to hold bitwise negated or odd values
Expand Down Expand Up @@ -164,6 +165,14 @@ pub(crate) fn bit_reverse_permutation<T>(buf: &mut [T]) {
/// [2] Christian Knauth, Boran Adas, Daniel Whitfield, Xuesong Wang, Lydia Ickler, Tim Conrad, Oliver Serang:
/// Practically efficient methods for performing bit-reversed permutation in C++11 on the x86-64 architecture
/// [3] <https://bitbucket.org/orserang/bit-reversal-methods/src/master/src_and_bin/src/algorithms/COBRAShuffle.hpp>
#[multiversion::multiversion(targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
"x86_64+avx2+fma", // x86_64-v3
"x86_64+sse4.2", // x86_64-v2
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
"x86+avx2+fma",
"x86+sse4.2",
"x86+sse2",
))]
pub fn cobra_apply<T: Default + Copy + Clone>(v: &mut [T], log_n: usize) {
if log_n <= 2 * LOG_BLOCK_WIDTH {
bit_rev(v, log_n);
Expand Down
4 changes: 4 additions & 0 deletions src/kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use num_traits::Float;

macro_rules! fft_butterfly_n_simd {
($func_name:ident, $precision:ty, $lanes:literal, $simd_vector:ty) => {
#[inline]
pub fn $func_name(
reals: &mut [$precision],
imags: &mut [$precision],
Expand Down Expand Up @@ -51,6 +52,7 @@ macro_rules! fft_butterfly_n_simd {
fft_butterfly_n_simd!(fft_64_chunk_n_simd, f64, 8, f64x8);
fft_butterfly_n_simd!(fft_32_chunk_n_simd, f32, 16, f32x16);

#[inline]
pub(crate) fn fft_chunk_n<T: Float>(
reals: &mut [T],
imags: &mut [T],
Expand Down Expand Up @@ -91,6 +93,7 @@ pub(crate) fn fft_chunk_n<T: Float>(
}

/// `chunk_size == 4`, so hard code twiddle factors
#[inline]
pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
let dist = 2;
let chunk_size = dist << 1;
Expand Down Expand Up @@ -125,6 +128,7 @@ pub(crate) fn fft_chunk_4<T: Float>(reals: &mut [T], imags: &mut [T]) {
}

/// `chunk_size == 2`, so skip phase
#[inline]
pub(crate) fn fft_chunk_2<T: Float>(reals: &mut [T], imags: &mut [T]) {
reals
.chunks_exact_mut(2)
Expand Down
11 changes: 10 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#![warn(clippy::suspicious)]
#![warn(clippy::perf)]
#![forbid(unsafe_code)]
#![feature(portable_simd)]
#![feature(portable_simd, avx512_target_feature)]

use crate::cobra::cobra_apply;
use crate::kernels::{
Expand Down Expand Up @@ -77,6 +77,15 @@ macro_rules! impl_fft_with_opts_and_plan_for {
/// # Panics
///
/// Panics if `reals.len() != imags.len()`, or if the input length is _not_ a power of 2.
#[multiversion::multiversion(
targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
"x86_64+avx2+fma", // x86_64-v3
"x86_64+sse4.2", // x86_64-v2
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
"x86+avx2+fma",
"x86+sse4.2",
"x86+sse2",
))]
pub fn $func_name(
reals: &mut [$precision],
imags: &mut [$precision],
Expand Down
1 change: 1 addition & 0 deletions src/twiddles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ macro_rules! generate_twiddles_simd {
generate_twiddles_simd!(generate_twiddles_simd_64, f64, 8, f64x8);
generate_twiddles_simd!(generate_twiddles_simd_32, f32, 8, f32x8);

#[inline]
pub(crate) fn filter_twiddles<T: Float>(twiddles_re: &mut Vec<T>, twiddles_im: &mut Vec<T>) {
assert_eq!(twiddles_re.len(), twiddles_im.len());
let dist = twiddles_re.len();
Expand Down

0 comments on commit 4985516

Please sign in to comment.