QuState · Shnatsel · Dec 8, 2025 · Dec 12, 2025 · Dec 12, 2025 · Dec 12, 2025
diff --git a/Cargo.toml b/Cargo.toml
@@ -19,6 +19,7 @@ num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true }
 bytemuck = { version = "1.23.2", optional = true }
 wide = "1.0.3"
 rayon = { version = "1.11.0", optional = true }
+fearless_simd = { git = "https://github.com/linebender/fearless_simd.git", version = "0.3.0" }
 
 [features]
 default = []

diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs
@@ -14,6 +14,8 @@
 //! DIT starts with fine-grained memory access and progressively works with
 //! larger contiguous chunks.
 //!
+use fearless_simd::{dispatch, Level, Simd};
+
 use crate::algorithms::cobra::cobra_apply;
 use crate::kernels::dit::{
     fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_chunk_16_simd_f32,
@@ -33,7 +35,8 @@ const L1_BLOCK_SIZE: usize = 1024;
 ///
 /// Recursively divides by 2 until reaching L1 cache size, processes stages within
 /// each block, then processes cross-block stages on return.
-fn recursive_dit_fft_f64(
+fn recursive_dit_fft_f64<S: Simd>(
+    simd: S,
     reals: &mut [f64],
     imags: &mut [f64],
     size: usize,
@@ -46,6 +49,7 @@ fn recursive_dit_fft_f64(
     if size <= L1_BLOCK_SIZE {
         for stage in 0..log_size {
             stage_twiddle_idx = execute_dit_stage_f64(
+                simd,
                 &mut reals[..size],
                 &mut imags[..size],
                 stage,
@@ -63,8 +67,8 @@ fn recursive_dit_fft_f64(
         // Recursively process both halves
         run_maybe_in_parallel(
             size > opts.smallest_parallel_chunk_size,
-            || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0),
-            || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0),
+            || recursive_dit_fft_f64(simd, re_first_half, im_first_half, half, planner, opts, 0),
+            || recursive_dit_fft_f64(simd, re_second_half, im_second_half, half, planner, opts, 0),
         );
 
         // Both halves completed stages 0..log_half-1
@@ -74,6 +78,7 @@ fn recursive_dit_fft_f64(
         // Process remaining stages that span both halves
         for stage in log_half..log_size {
             stage_twiddle_idx = execute_dit_stage_f64(
+                simd,
                 &mut reals[..size],
                 &mut imags[..size],
                 stage,
@@ -87,7 +92,8 @@ fn recursive_dit_fft_f64(
 }
 
 /// Recursive cache-blocked DIT FFT for f32 using post-order traversal.
-fn recursive_dit_fft_f32(
+fn recursive_dit_fft_f32<S: Simd>(
+    simd: S,
     reals: &mut [f32],
     imags: &mut [f32],
     size: usize,
@@ -100,6 +106,7 @@ fn recursive_dit_fft_f32(
     if size <= L1_BLOCK_SIZE {
         for stage in 0..log_size {
             stage_twiddle_idx = execute_dit_stage_f32(
+                simd,
                 &mut reals[..size],
                 &mut imags[..size],
                 stage,
@@ -117,8 +124,8 @@ fn recursive_dit_fft_f32(
         // Recursively process both halves
         run_maybe_in_parallel(
             size > opts.smallest_parallel_chunk_size,
-            || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0),
-            || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0),
+            || recursive_dit_fft_f32(simd, re_first_half, im_first_half, half, planner, opts, 0),
+            || recursive_dit_fft_f32(simd, re_second_half, im_second_half, half, planner, opts, 0),
         );
 
         // Both halves completed stages 0..log_half-1
@@ -128,6 +135,7 @@ fn recursive_dit_fft_f32(
         // Process remaining stages that span both halves
         for stage in log_half..log_size {
             stage_twiddle_idx = execute_dit_stage_f32(
+                simd,
                 &mut reals[..size],
                 &mut imags[..size],
                 stage,
@@ -142,7 +150,8 @@ fn recursive_dit_fft_f32(
 
 /// Execute a single DIT stage, dispatching to appropriate kernel based on chunk size.
 /// Returns updated stage_twiddle_idx.
-fn execute_dit_stage_f64(
+fn execute_dit_stage_f64<S: Simd>(
+    simd: S,
     reals: &mut [f64],
     imags: &mut [f64],
     stage: usize,
@@ -153,34 +162,37 @@ fn execute_dit_stage_f64(
     let chunk_size = dist << 1;
 
     if chunk_size == 2 {
-        fft_dit_chunk_2(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 4 {
-        fft_dit_chunk_4_simd_f64(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_4_simd_f64(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 8 {
-        fft_dit_chunk_8_simd_f64(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_8_simd_f64(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 16 {
-        fft_dit_chunk_16_simd_f64(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_16_simd_f64(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 32 {
-        fft_dit_chunk_32_simd_f64(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_32_simd_f64(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 64 {
-        fft_dit_chunk_64_simd_f64(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_64_simd_f64(simd, reals, imags));
         stage_twiddle_idx
     } else {
         // For larger chunks, use general kernel with twiddles from planner
         let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx];
-        fft_dit_64_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
+        simd.vectorize(|| {
+            fft_dit_64_chunk_n_simd(simd, reals, imags, twiddles_re, twiddles_im, dist)
+        });
         stage_twiddle_idx + 1
     }
 }
 
 /// Execute a single DIT stage, dispatching to appropriate kernel based on chunk size.
 /// Returns updated stage_twiddle_idx.
-fn execute_dit_stage_f32(
+fn execute_dit_stage_f32<S: Simd>(
+    simd: S,
     reals: &mut [f32],
     imags: &mut [f32],
     stage: usize,
@@ -191,27 +203,29 @@ fn execute_dit_stage_f32(
     let chunk_size = dist << 1;
 
     if chunk_size == 2 {
-        fft_dit_chunk_2(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 4 {
-        fft_dit_chunk_4_simd_f32(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_4_simd_f32(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 8 {
-        fft_dit_chunk_8_simd_f32(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_8_simd_f32(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 16 {
-        fft_dit_chunk_16_simd_f32(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_16_simd_f32(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 32 {
-        fft_dit_chunk_32_simd_f32(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_32_simd_f32(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 64 {
-        fft_dit_chunk_64_simd_f32(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_64_simd_f32(simd, reals, imags));
         stage_twiddle_idx
     } else {
         // For larger chunks, use general kernel with twiddles from planner
         let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx];
-        fft_dit_32_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
+        simd.vectorize(|| {
+            fft_dit_32_chunk_n_simd(simd, reals, imags, twiddles_re, twiddles_im, dist)
+        });
         stage_twiddle_idx + 1
     }
 }
@@ -261,7 +275,8 @@ pub fn fft_64_dit_with_planner_and_opts(
         }
     }
 
-    recursive_dit_fft_f64(reals, imags, n, planner, opts, 0);
+    let simd_level = Level::new();
+    dispatch!(simd_level, simd => recursive_dit_fft_f64(simd, reals, imags, n, planner, opts, 0));
 
     // Scaling for inverse transform
     if let Direction::Reverse = planner.direction {
@@ -304,7 +319,8 @@ pub fn fft_32_dit_with_planner_and_opts(
         }
     }
 
-    recursive_dit_fft_f32(reals, imags, n, planner, opts, 0);
+    let simd_level = Level::new();
+    dispatch!(simd_level, simd => recursive_dit_fft_f32(simd, reals, imags, n, planner, opts, 0));
 
     // Scaling for inverse transform
     if let Direction::Reverse = planner.direction {