QuState · smu160 · Mar 1, 2024 · Mar 4, 2024
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,6 +10,10 @@ keywords = ["quantum", "fft", "discrete", "fourier", "transform"]
 categories = ["algorithms", "compression", "science"]
 exclude = ["assets", "scripts", "benches"]
 
+[features]
+default = ["double"] # f64 precision unless default features are disabled
+single = [] # f32 precision; exclusive with `double`, so build with `--no-default-features --features single`
+double = [] # f64 precision; exclusive with `single` (both define the crate's `Float` type)
 
 [dev-dependencies]
 utilities = { path = "utilities" }

diff --git a/examples/benchmark.rs b/examples/benchmark.rs
@@ -3,13 +3,13 @@ use std::str::FromStr;
 
 use utilities::gen_random_signal;
 
-use phastft::fft;
 use phastft::planner::Direction;
+use phastft::{fft, Float};
 
 fn benchmark_fft(n: usize) {
     let big_n = 1 << n;
-    let mut reals = vec![0.0; big_n];
-    let mut imags = vec![0.0; big_n];
+    let mut reals: Vec<Float> = vec![0.0; big_n];
+    let mut imags: Vec<Float> = vec![0.0; big_n];
     gen_random_signal(&mut reals, &mut imags);
 
     let now = std::time::Instant::now();

diff --git a/examples/fftwrb.rs b/examples/fftwrb.rs
@@ -2,18 +2,24 @@ use std::{env, ptr::slice_from_raw_parts_mut, str::FromStr};
 
 use fftw::{
     array::AlignedVec,
-    plan::{C2CPlan, C2CPlan64},
     types::{Flag, Sign},
 };
+use fftw::plan::C2CPlan;
+#[cfg(feature = "single")]
+use fftw::plan::C2CPlan32;
+#[cfg(feature = "double")]
+use fftw::plan::C2CPlan64;
 use utilities::{gen_random_signal, rustfft::num_complex::Complex};
 
+use phastft::Float;
+
 fn benchmark_fftw(n: usize) {
     let big_n = 1 << n;
 
     let mut reals = vec![0.0; big_n];
     let mut imags = vec![0.0; big_n];
 
-    gen_random_signal(&mut reals, &mut imags);
+    gen_random_signal::<Float>(&mut reals, &mut imags);
     let mut nums = AlignedVec::new(big_n);
     reals
         .drain(..)
@@ -22,6 +28,8 @@ fn benchmark_fftw(n: usize) {
         .for_each(|((re, im), z)| *z = Complex::new(re, im));
 
     let now = std::time::Instant::now();
+
+    #[cfg(feature = "double")]
     C2CPlan64::aligned(
         &[big_n],
         Sign::Backward,
@@ -34,6 +42,21 @@ fn benchmark_fftw(n: usize) {
         &mut nums,
     )
     .unwrap();
+
+    #[cfg(feature = "single")]
+    C2CPlan32::aligned(
+        &[big_n],
+        Sign::Backward,
+        Flag::DESTROYINPUT | Flag::ESTIMATE,
+    )
+    .unwrap()
+    .c2c(
+        // SAFETY: See above comment.
+        unsafe { &mut *slice_from_raw_parts_mut(nums.as_mut_ptr(), big_n) },
+        &mut nums,
+    )
+    .unwrap();
+
     let elapsed = now.elapsed().as_micros();
     println!("{elapsed}");
 }

diff --git a/examples/profile.rs b/examples/profile.rs
@@ -1,13 +1,13 @@
 use std::env;
 use std::str::FromStr;
 
-use phastft::fft;
 use phastft::planner::Direction;
+use phastft::{fft, Float};
 
 fn benchmark_fft(num_qubits: usize) {
     let n = 1 << num_qubits;
-    let mut reals: Vec<f64> = (1..=n).map(|i| i as f64).collect();
-    let mut imags: Vec<f64> = (1..=n).map(|i| i as f64).collect();
+    let mut reals: Vec<_> = (1..=n).map(|i| i as Float).collect(); // ramp 1..=n; element type follows the crate's `Float`
+    let mut imags: Vec<_> = (1..=n).map(|i| i as Float).collect(); // NOTE: when `Float` is f32, `as` rounds values above 2^24
     fft(&mut reals, &mut imags, Direction::Forward);
 }
 

diff --git a/src/cobra.rs b/src/cobra.rs
@@ -13,7 +13,8 @@
 //! Symposium on Foundations of Computer Science (Cat. No.98CB36280), Palo Alto, CA, USA, 1998, pp. 544-553, doi:
 //! 10.1109/SFCS.1998.743505.
 //! keywords: {Read-write memory;Costs;Computer science;Drives;Random access memory;Argon;Registers;Read only memory;Computational modeling;Libraries}
-use crate::kernels::Float;
+
+use crate::Float;
 
 const BLOCK_WIDTH: usize = 128;
 // size of the cacheline
@@ -317,17 +318,17 @@ mod tests {
     fn jennifer_method() {
         for n in 2..24 {
             let big_n = 1 << n;
-            let mut actual_re: Vec<f64> = (0..big_n).map(f64::from).collect();
-            let mut actual_im: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let mut actual_re: Vec<Float> = (0..big_n).map(|i| i as Float).collect();
+            let mut actual_im: Vec<Float> = (0..big_n).map(|i| i as Float).collect();
 
             #[allow(deprecated)]
             complex_bit_rev(&mut actual_re, &mut actual_im, n);
 
-            let input_re: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let input_re: Vec<Float> = (0..big_n).map(|i| i as Float).collect();
             let expected_re = top_down_bit_reverse_permutation(&input_re);
             assert_eq!(actual_re, expected_re);
 
-            let input_im: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let input_im: Vec<Float> = (0..big_n).map(|i| i as Float).collect();
             let expected_im = top_down_bit_reverse_permutation(&input_im);
             assert_eq!(actual_im, expected_im);
         }
@@ -337,17 +338,17 @@ mod tests {
     fn jennifer_method_parallel() {
         for n in 2..24 {
             let big_n = 1 << n;
-            let mut actual_re: Vec<f64> = (0..big_n).map(f64::from).collect();
-            let mut actual_im: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let mut actual_re: Vec<Float> = (0..big_n).map(|i| i as Float).collect();
+            let mut actual_im: Vec<Float> = (0..big_n).map(|i| i as Float).collect();
 
             #[allow(deprecated)]
             bit_reverse_permute_state_par(&mut actual_re, &mut actual_im, n);
 
-            let input_re: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let input_re: Vec<Float> = (0..big_n).map(|i| i as Float).collect();
             let expected_re = top_down_bit_reverse_permutation(&input_re);
             assert_eq!(actual_re, expected_re);
 
-            let input_im: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let input_im: Vec<Float> = (0..big_n).map(|i| i as Float).collect();
             let expected_im = top_down_bit_reverse_permutation(&input_im);
             assert_eq!(actual_im, expected_im);
         }

diff --git a/src/kernels.rs b/src/kernels.rs
@@ -1,7 +1,11 @@
+#[cfg(feature = "single")]
+use std::simd::f32x16;
+#[cfg(feature = "double")]
 use std::simd::f64x8;
 
-pub type Float = f64;
+use crate::Float;
 
+#[cfg(feature = "double")]
 pub(crate) fn fft_chunk_n_simd(
     reals: &mut [Float],
     imags: &mut [Float],
@@ -45,7 +49,51 @@ pub(crate) fn fft_chunk_n_simd(
         });
 }
 
-// TODO(saveliy): parallelize
+#[cfg(feature = "single")] // f32 counterpart of the `double` kernel: one butterfly pass using `f32x16`
+pub(crate) fn fft_chunk_n_simd(
+    reals: &mut [Float], // real parts, updated in place
+    imags: &mut [Float], // imaginary parts, updated in place
+    twiddles_re: &[Float], // real parts of the twiddle factors
+    twiddles_im: &[Float], // imaginary parts of the twiddle factors
+    dist: usize, // length of each half of a chunk
+) {
+    const VECTOR_SIZE: usize = 16; // lane count of `f32x16`
+    let chunk_size = dist << 1; // a chunk is two halves of `dist` elements
+    assert!(chunk_size >= 32); // i.e. dist >= 16, so each half holds at least one full vector
+
+    reals
+        .chunks_exact_mut(chunk_size) // a trailing partial chunk, if any, is left untouched
+        .zip(imags.chunks_exact_mut(chunk_size))
+        .for_each(|(reals_chunk, imags_chunk)| {
+            let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist); // s0 = first half, s1 = second half
+            let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist);
+
+            reals_s0
+                .chunks_exact_mut(VECTOR_SIZE)
+                .zip(reals_s1.chunks_exact_mut(VECTOR_SIZE))
+                .zip(imags_s0.chunks_exact_mut(VECTOR_SIZE))
+                .zip(imags_s1.chunks_exact_mut(VECTOR_SIZE))
+                .zip(twiddles_re.chunks_exact(VECTOR_SIZE)) // zip stops at the shortest input; presumably `dist` twiddles are passed -- confirm at call site
+                .zip(twiddles_im.chunks_exact(VECTOR_SIZE))
+                .for_each(|(((((re_s0, re_s1), im_s0), im_s1), w_re), w_im)| {
+                    let real_c0 = f32x16::from_slice(re_s0); // load before the stores below overwrite the slices
+                    let real_c1 = f32x16::from_slice(re_s1);
+                    let imag_c0 = f32x16::from_slice(im_s0);
+                    let imag_c1 = f32x16::from_slice(im_s1);
+
+                    let tw_re = f32x16::from_slice(w_re);
+                    let tw_im = f32x16::from_slice(w_im);
+
+                    re_s0.copy_from_slice((real_c0 + real_c1).as_array()); // first half <- c0 + c1
+                    im_s0.copy_from_slice((imag_c0 + imag_c1).as_array());
+                    let v_re = real_c0 - real_c1; // v = c0 - c1
+                    let v_im = imag_c0 - imag_c1;
+                    re_s1.copy_from_slice((v_re * tw_re - v_im * tw_im).as_array()); // second half <- Re(v * w)
+                    im_s1.copy_from_slice((v_re * tw_im + v_im * tw_re).as_array()); // second half <- Im(v * w)
+                });
+        });
+}
+
 pub(crate) fn fft_chunk_n(
     reals: &mut [Float],
     imags: &mut [Float],
@@ -119,7 +167,7 @@ pub(crate) fn fft_chunk_4(reals: &mut [Float], imags: &mut [Float]) {
         });
 }
 
-/// `chunk_size == 2`, so skip phase
+/// `chunk_size == 2`, so we only need 1 and -1
 pub(crate) fn fft_chunk_2(reals: &mut [Float], imags: &mut [Float]) {
     reals
         .chunks_exact_mut(2)

diff --git a/src/lib.rs b/src/lib.rs
@@ -9,7 +9,7 @@
 #![feature(portable_simd)]
 
 use crate::cobra::cobra_apply;
-use crate::kernels::{fft_chunk_2, fft_chunk_4, fft_chunk_n, fft_chunk_n_simd, Float};
+use crate::kernels::{fft_chunk_2, fft_chunk_4, fft_chunk_n, fft_chunk_n_simd};
 use crate::options::Options;
 use crate::planner::{Direction, Planner};
 use crate::twiddles::filter_twiddles;
@@ -20,6 +20,28 @@
 pub mod planner;
 mod twiddles;
 
+// `single` and `double` each define `Float` (and their own SIMD kernel), so
+// exactly one of them must be enabled. `double` is a default feature, which
+// means `single` has to be selected with `--no-default-features`.
+#[cfg(all(feature = "single", feature = "double"))]
+compile_error!(
+    "features `single` and `double` are mutually exclusive; \
+     build with `--no-default-features --features single` for f32"
+);
+
+#[cfg(not(any(feature = "single", feature = "double")))]
+compile_error!("enable exactly one of the features `single` or `double`");
+
+/// Floating-point type used for signal and twiddle data: `f64` when the
+/// `double` feature is enabled (the default).
+#[cfg(feature = "double")]
+pub type Float = f64;
+
+/// Floating-point type used for signal and twiddle data: `f32` when the
+/// `single` feature is enabled.
+#[cfg(feature = "single")]
+pub type Float = f32;
+
 /// FFT -- Decimation in Frequency. This is just the decimation-in-time algorithm, reversed.
 /// This call to FFT is run, in-place.
 /// The input should be provided in normal order, and then the modified input is bit-reversed.
@@ -83,7 +91,7 @@
             if t < n - 1 {
                 filter_twiddles(twiddles_re, twiddles_im);
             }
-            if chunk_size >= 16 {
+            if chunk_size >= 32 {
                 fft_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
             } else {
                 fft_chunk_n(reals, imags, twiddles_re, twiddles_im, dist);
@@ -110,12 +118,11 @@
 mod tests {
     use std::ops::Range;
 
-    use utilities::{
-        assert_f64_closeness,
-        rustfft::{num_complex::Complex64, FftPlanner},
-    };
+    use utilities::assert_float_closeness;
+    use utilities::rustfft::FftPlanner;
+    use utilities::rustfft::num_complex::Complex;
 
     use crate::planner::Direction;
 
    use super::*;

@@ -165,12 +172,12 @@
         for k in range {
             let n: usize = 1 << k;
 
-            let mut reals: Vec<Float> = (1..=n).map(|i| i as f64).collect();
-            let mut imags: Vec<Float> = (1..=n).map(|i| i as f64).collect();
+            let mut reals: Vec<Float> = (1..=n).map(|i| i as Float).collect();
+            let mut imags: Vec<Float> = (1..=n).map(|i| i as Float).collect();
             fft(&mut reals, &mut imags, Direction::Forward);
 
-            let mut buffer: Vec<Complex64> = (1..=n)
-                .map(|i| Complex64::new(i as f64, i as f64))
+            let mut buffer: Vec<Complex<Float>> = (1..=n)
+                .map(|i| Complex::new(i as Float, i as Float))
                 .collect();
 
             let mut planner = FftPlanner::new();
@@ -184,8 +191,8 @@
                 .for_each(|(i, (z_re, z_im))| {
                     let expect_re = buffer[i].re;
                     let expect_im = buffer[i].im;
-                    assert_f64_closeness(*z_re, expect_re, 0.01);
-                    assert_f64_closeness(*z_im, expect_im, 0.01);
+                    assert_float_closeness(*z_re, expect_re, 0.01);
+                    assert_float_closeness(*z_im, expect_im, 0.01);
                 });
         }
     }

diff --git a/src/planner.rs b/src/planner.rs
@@ -2,7 +2,7 @@
 //! a Fast Fourier Transform (FFT). Currently, the planner is responsible for
 //! pre-computing twiddle factors based on the input signal length, as well as the
 //! direction of the FFT.
-
+use crate::Float;
 use crate::twiddles::{generate_twiddles, generate_twiddles_simd};
 
 /// Reverse is for running the Inverse Fast Fourier Transform (IFFT)
@@ -20,9 +20,9 @@ pub enum Direction {
 /// the amount of twiddle factors should always be `(1/2) * N`
 pub struct Planner {
     /// The real components of the twiddle factors
-    pub twiddles_re: Vec<f64>,
+    pub twiddles_re: Vec<Float>,
     /// The imaginary components of the twiddle factors
-    pub twiddles_im: Vec<f64>,
+    pub twiddles_im: Vec<Float>,
 }
 
 impl Planner {
@@ -66,7 +66,7 @@ impl Planner {
 
 #[cfg(test)]
 mod tests {
-    use utilities::assert_f64_closeness;
+    use utilities::assert_float_closeness;
 
     use crate::planner::{Direction, Planner};
 
@@ -101,8 +101,8 @@ mod tests {
                 .for_each(|(((a, b), c), d)| {
                     let temp_re = a * c - b * d;
                     let temp_im = a * d + b * c;
-                    assert_f64_closeness(temp_re, 1.0, 1e-6);
-                    assert_f64_closeness(temp_im, 0.0, 1e-6);
+                    assert_float_closeness(temp_re, 1.0, 1e-3);
+                    assert_float_closeness(temp_im, 0.0, 1e-3);
                 });
         }
     }