From 8b436ef6ee028ca8b2b4274e468269caa4f21fcc Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 20 Jan 2026 21:53:06 +0000 Subject: [PATCH 1/5] Add initial experimental BRAVO impl --- src/algorithms/bravo.rs | 251 ++++++++++++++++++++++++++++++++++++++++ src/algorithms/mod.rs | 1 + 2 files changed, 252 insertions(+) create mode 100644 src/algorithms/bravo.rs diff --git a/src/algorithms/bravo.rs b/src/algorithms/bravo.rs new file mode 100644 index 0000000..78f7495 --- /dev/null +++ b/src/algorithms/bravo.rs @@ -0,0 +1,251 @@ +/// BRAVO: Bit-Reversal Algorithm using Vector permute Operations +/// +/// This implements the algorithm from "Optimal Bit-Reversal Using Vector Permutations" +/// by Lokhmotov and Mycroft (SPAA'07). +/// +/// The algorithm uses vector interleaving operations to perform bit-reversal permutation. +/// For N = 2^n elements with W-element vectors, the algorithm performs log₂(N) rounds +/// of in-place interleave operations on pairs of vectors. +/// +/// When nightly Rust with std::simd is available, this can use hardware SIMD instructions. +/// For now, we implement the interleave operation manually to demonstrate the algorithm. +/// +/// The initial implementation was heavily assisted by Claude Code + +const LANES: usize = 4; // Vector width W + +/// A simple vector type that mimics std::simd::Simd for f64 +#[derive(Clone, Copy)] +struct Vec4([f64; LANES]); + +impl Vec4 { + fn from_slice(s: &[f64]) -> Self { + Vec4([s[0], s[1], s[2], s[3]]) + } + + fn copy_to_slice(self, s: &mut [f64]) { + s[0] = self.0[0]; + s[1] = self.0[1]; + s[2] = self.0[2]; + s[3] = self.0[3]; + } + + /// Interleave two vectors, producing low and high halves. + /// For vectors [a0, a1, a2, a3] and [b0, b1, b2, b3]: + /// - low: [a0, b0, a1, b1] + /// - high: [a2, b2, a3, b3] + /// + /// This matches std::simd::Simd::interleave() behavior. 
+ fn interleave(self, other: Vec4) -> (Vec4, Vec4) { + let a = self.0; + let b = other.0; + let lo = Vec4([a[0], b[0], a[1], b[1]]); + let hi = Vec4([a[2], b[2], a[3], b[3]]); + (lo, hi) + } +} + +/// Performs in-place bit-reversal permutation using the BRAVO algorithm. +/// +/// # Arguments +/// * `data` - The slice to permute in-place. Length must be a power of 2 and >= LANES². +/// * `n` - The log₂ of the data length (i.e., data.len() == 2^n) +pub fn bit_rev_bravo(data: &mut [f64], n: usize) { + let big_n = 1usize << n; + assert_eq!(data.len(), big_n, "Data length must be 2^n"); + + // For very small arrays, fall back to scalar bit-reversal + if big_n < LANES * LANES { + scalar_bit_reversal(data, n); + return; + } + + let w = LANES; + let log_w = w.ilog2() as usize; // = 2 for W=4 + + // π = N / W² = number of equivalence classes + let num_classes = big_n / (w * w); + let class_bits = n - 2 * log_w; + + // Process each equivalence class. + // For in-place operation, we handle class pairs that swap with each other. + // We only process when class_idx <= class_idx_rev to avoid double processing. 
+ + for class_idx in 0..num_classes { + let class_idx_rev = if class_bits > 0 { + class_idx.reverse_bits() >> (usize::BITS - class_bits as u32) + } else { + 0 + }; + + // Only process if this is the "first" of a swapping pair (or self-mapping) + if class_idx > class_idx_rev { + continue; + } + + // Load vectors for class A + let mut vecs_a: [Vec4; LANES] = [Vec4([0.0; LANES]); LANES]; + for j in 0..w { + let base_idx = (class_idx + j * num_classes) * w; + vecs_a[j] = Vec4::from_slice(&data[base_idx..base_idx + w]); + } + + // Perform interleave rounds for class A + for round in 0..log_w { + let mut new_vecs: [Vec4; LANES] = [Vec4([0.0; LANES]); LANES]; + let stride = 1 << round; + + // W/2 pairs per round, stored as parallel arrays + let mut los: [Vec4; LANES / 2] = [Vec4([0.0; LANES]); LANES / 2]; + let mut his: [Vec4; LANES / 2] = [Vec4([0.0; LANES]); LANES / 2]; + + let mut pair_idx = 0; + let mut i = 0; + while i < w { + for offset in 0..stride { + let idx0 = i + offset; + let idx1 = i + offset + stride; + let (lo, hi) = vecs_a[idx0].interleave(vecs_a[idx1]); + los[pair_idx] = lo; + his[pair_idx] = hi; + pair_idx += 1; + } + i += stride * 2; + } + + for j in 0..(w / 2) { + let base = (j % stride) + (j / stride) * stride * 2; + new_vecs[base] = los[j]; + new_vecs[base + stride] = his[j]; + } + + vecs_a = new_vecs; + } + + if class_idx == class_idx_rev { + // Self-mapping class - just write back to same location + for j in 0..w { + let base_idx = (class_idx + j * num_classes) * w; + vecs_a[j].copy_to_slice(&mut data[base_idx..base_idx + w]); + } + } else { + // Swapping pair - load class B, process it, then swap both + let mut vecs_b: [Vec4; LANES] = [Vec4([0.0; LANES]); LANES]; + for j in 0..w { + let base_idx = (class_idx_rev + j * num_classes) * w; + vecs_b[j] = Vec4::from_slice(&data[base_idx..base_idx + w]); + } + + // Perform interleave rounds for class B + for round in 0..log_w { + let mut new_vecs: [Vec4; LANES] = [Vec4([0.0; LANES]); LANES]; + let stride 
= 1 << round; + + let mut los: [Vec4; LANES / 2] = [Vec4([0.0; LANES]); LANES / 2]; + let mut his: [Vec4; LANES / 2] = [Vec4([0.0; LANES]); LANES / 2]; + + let mut pair_idx = 0; + let mut i = 0; + while i < w { + for offset in 0..stride { + let idx0 = i + offset; + let idx1 = i + offset + stride; + let (lo, hi) = vecs_b[idx0].interleave(vecs_b[idx1]); + los[pair_idx] = lo; + his[pair_idx] = hi; + pair_idx += 1; + } + i += stride * 2; + } + + for j in 0..(w / 2) { + let base = (j % stride) + (j / stride) * stride * 2; + new_vecs[base] = los[j]; + new_vecs[base + stride] = his[j]; + } + + vecs_b = new_vecs; + } + + // Swap: write A's result to B's location and vice versa + for j in 0..w { + let base_idx_a = (class_idx + j * num_classes) * w; + let base_idx_b = (class_idx_rev + j * num_classes) * w; + vecs_a[j].copy_to_slice(&mut data[base_idx_b..base_idx_b + w]); + vecs_b[j].copy_to_slice(&mut data[base_idx_a..base_idx_a + w]); + } + } + } +} + +/// Scalar bit-reversal for small arrays +fn scalar_bit_reversal(data: &mut [f64], n: usize) { + let big_n = data.len(); + + for i in 0..big_n { + let j = reverse_bits_scalar(i, n as u32); + if i < j { + data.swap(i, j); + } + } +} + +/// Reverse the lower `bits` bits of `x` +fn reverse_bits_scalar(x: usize, bits: u32) -> usize { + if bits == 0 { + return 0; + } + x.reverse_bits() >> (usize::BITS - bits) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Top down bit reverse interleaving. This is a very simple and well known approach that we + /// only use for testing due to its lackluster performance. 
+    fn top_down_bit_reverse_permutation<T: Copy>(x: &[T]) -> Vec<T> {
+        if x.len() == 1 {
+            return x.to_vec();
+        }
+        let mut y = Vec::with_capacity(x.len());
+        let mut evens = Vec::with_capacity(x.len() >> 1);
+        let mut odds = Vec::with_capacity(x.len() >> 1);
+        let mut i = 1;
+        while i < x.len() {
+            evens.push(x[i - 1]);
+            odds.push(x[i]);
+            i += 2;
+        }
+        y.extend_from_slice(&top_down_bit_reverse_permutation(&evens));
+        y.extend_from_slice(&top_down_bit_reverse_permutation(&odds));
+        y
+    }
+
+    #[test]
+    fn test_bravo_bit_reversal() {
+        for n in 8..24 {
+            let big_n = 1 << n;
+            let mut actual_re: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let mut actual_im: Vec<f64> = (0..big_n).map(f64::from).collect();
+            bit_rev_bravo(&mut actual_re, n);
+            bit_rev_bravo(&mut actual_im, n);
+            let input_re: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let expected_re = top_down_bit_reverse_permutation(&input_re);
+            assert_eq!(actual_re, expected_re);
+            let input_im: Vec<f64> = (0..big_n).map(f64::from).collect();
+            let expected_im = top_down_bit_reverse_permutation(&input_im);
+            assert_eq!(actual_im, expected_im);
+        }
+    }
+
+    #[test]
+    fn test_small_cases() {
+        // Test n=4 (16 elements) - smallest case with W=4 vectors
+        let mut data: Vec<f64> = (0..16).map(f64::from).collect();
+        bit_rev_bravo(&mut data, 4);
+        let expected =
+            top_down_bit_reverse_permutation(&(0..16).map(f64::from).collect::<Vec<f64>>());
+        assert_eq!(data, expected);
+    }
+}
diff --git a/src/algorithms/mod.rs b/src/algorithms/mod.rs
index 9a7120d..259b316 100644
--- a/src/algorithms/mod.rs
+++ b/src/algorithms/mod.rs
@@ -18,6 +18,7 @@
 //! - Use DIF if you don't want to or don't need to apply a bit reversal on the input.
 //! - Use DIT for slightly better performance.
+pub mod bravo; pub mod cobra; pub mod dif; pub mod dit; From 740225aa20772962e9a80e588f7638c20235e527 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 20 Jan 2026 22:03:02 +0000 Subject: [PATCH 2/5] jankily bump vector size to confirm everything still works for larger sizes --- src/algorithms/bravo.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/algorithms/bravo.rs b/src/algorithms/bravo.rs index 78f7495..1650dcb 100644 --- a/src/algorithms/bravo.rs +++ b/src/algorithms/bravo.rs @@ -9,10 +9,10 @@ /// /// When nightly Rust with std::simd is available, this can use hardware SIMD instructions. /// For now, we implement the interleave operation manually to demonstrate the algorithm. -/// +/// /// The initial implementation was heavily assisted by Claude Code -const LANES: usize = 4; // Vector width W +const LANES: usize = 8; // Vector width W /// A simple vector type that mimics std::simd::Simd for f64 #[derive(Clone, Copy)] @@ -20,7 +20,7 @@ struct Vec4([f64; LANES]); impl Vec4 { fn from_slice(s: &[f64]) -> Self { - Vec4([s[0], s[1], s[2], s[3]]) + Vec4([s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]]) } fn copy_to_slice(self, s: &mut [f64]) { @@ -28,6 +28,10 @@ impl Vec4 { s[1] = self.0[1]; s[2] = self.0[2]; s[3] = self.0[3]; + s[4] = self.0[4]; + s[5] = self.0[5]; + s[6] = self.0[6]; + s[7] = self.0[7]; } /// Interleave two vectors, producing low and high halves. 
@@ -39,8 +43,8 @@ impl Vec4 {
     fn interleave(self, other: Vec4) -> (Vec4, Vec4) {
         let a = self.0;
         let b = other.0;
-        let lo = Vec4([a[0], b[0], a[1], b[1]]);
-        let hi = Vec4([a[2], b[2], a[3], b[3]]);
+        let lo = Vec4([a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3]]);
+        let hi = Vec4([a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7]]);
         (lo, hi)
     }
 }

From d867542badcefcf20320da581d6ee8f44c106900 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Tue, 20 Jan 2026 22:07:47 +0000
Subject: [PATCH 3/5] Make BRAVO impl generic over T as opposed to f64-only

---
 src/algorithms/bravo.rs | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/algorithms/bravo.rs b/src/algorithms/bravo.rs
index 1650dcb..2cc4f77 100644
--- a/src/algorithms/bravo.rs
+++ b/src/algorithms/bravo.rs
@@ -15,15 +15,15 @@
 const LANES: usize = 8; // Vector width W
 
 /// A simple vector type that mimics std::simd::Simd for f64
-#[derive(Clone, Copy)]
-struct Vec4([f64; LANES]);
+#[derive(Clone, Copy, Default)]
+struct Vec4<T>([T; LANES]);
 
-impl Vec4 {
-    fn from_slice(s: &[f64]) -> Self {
+impl<T: Copy> Vec4<T> {
+    fn from_slice(s: &[T]) -> Self {
         Vec4([s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]])
     }
 
-    fn copy_to_slice(self, s: &mut [f64]) {
+    fn copy_to_slice(self, s: &mut [T]) {
         s[0] = self.0[0];
         s[1] = self.0[1];
         s[2] = self.0[2];
@@ -40,7 +40,7 @@ impl Vec4 {
     /// - high: [a2, b2, a3, b3]
     ///
     /// This matches std::simd::Simd::interleave() behavior.
-    fn interleave(self, other: Vec4) -> (Vec4, Vec4) {
+    fn interleave(self, other: Vec4<T>) -> (Vec4<T>, Vec4<T>) {
         let a = self.0;
         let b = other.0;
         let lo = Vec4([a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3]]);
@@ -54,7 +54,7 @@
 /// # Arguments
 /// * `data` - The slice to permute in-place. Length must be a power of 2 and >= LANES².
 /// * `n` - The log₂ of the data length (i.e., data.len() == 2^n)
-pub fn bit_rev_bravo(data: &mut [f64], n: usize) {
+pub fn bit_rev_bravo<T: Default + Copy>(data: &mut [T], n: usize) {
     let big_n = 1usize << n;
     assert_eq!(data.len(), big_n, "Data length must be 2^n");
@@ -88,7 +88,7 @@
         }
 
         // Load vectors for class A
-        let mut vecs_a: [Vec4; LANES] = [Vec4([0.0; LANES]); LANES];
+        let mut vecs_a: [Vec4<T>; LANES] = [Vec4([T::default(); LANES]); LANES];
         for j in 0..w {
             let base_idx = (class_idx + j * num_classes) * w;
             vecs_a[j] = Vec4::from_slice(&data[base_idx..base_idx + w]);
@@ -96,12 +96,12 @@
 
         // Perform interleave rounds for class A
         for round in 0..log_w {
-            let mut new_vecs: [Vec4; LANES] = [Vec4([0.0; LANES]); LANES];
+            let mut new_vecs: [Vec4<T>; LANES] = [Vec4([T::default(); LANES]); LANES];
             let stride = 1 << round;
 
             // W/2 pairs per round, stored as parallel arrays
-            let mut los: [Vec4; LANES / 2] = [Vec4([0.0; LANES]); LANES / 2];
-            let mut his: [Vec4; LANES / 2] = [Vec4([0.0; LANES]); LANES / 2];
+            let mut los: [Vec4<T>; LANES / 2] = [Vec4([T::default(); LANES]); LANES / 2];
+            let mut his: [Vec4<T>; LANES / 2] = [Vec4([T::default(); LANES]); LANES / 2];
 
             let mut pair_idx = 0;
             let mut i = 0;
@@ -134,7 +134,7 @@
             }
         } else {
             // Swapping pair - load class B, process it, then swap both
-            let mut vecs_b: [Vec4; LANES] = [Vec4([0.0; LANES]); LANES];
+            let mut vecs_b: [Vec4<T>; LANES] = [Vec4([T::default(); LANES]); LANES];
             for j in 0..w {
                 let base_idx = (class_idx_rev + j * num_classes) * w;
                 vecs_b[j] = Vec4::from_slice(&data[base_idx..base_idx + w]);
@@ -142,11 +142,11 @@
 
             // Perform interleave rounds for class B
             for round in 0..log_w {
-                let mut new_vecs: [Vec4; LANES] = [Vec4([0.0; LANES]); LANES];
+                let mut new_vecs: [Vec4<T>; LANES] = [Vec4([T::default(); LANES]); LANES];
                 let stride
-                let mut los: [Vec4; LANES / 2] = [Vec4([0.0; LANES]); LANES / 2];
-                let mut his: [Vec4; LANES / 2] = [Vec4([0.0; LANES]); LANES / 2];
+                let mut los: [Vec4<T>; LANES / 2] = [Vec4([T::default(); LANES]); LANES / 2];
+                let mut his: [Vec4<T>; LANES / 2] = [Vec4([T::default(); LANES]); LANES / 2];
 
                 let mut pair_idx = 0;
                 let mut i = 0;
@@ -183,7 +183,7 @@
 }
 
 /// Scalar bit-reversal for small arrays
-fn scalar_bit_reversal(data: &mut [f64], n: usize) {
+fn scalar_bit_reversal<T>(data: &mut [T], n: usize) {
     let big_n = data.len();
 
     for i in 0..big_n {

From 8a6bbc5bf467f16c207f471ff0642f919e3e4060 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Tue, 20 Jan 2026 22:14:05 +0000
Subject: [PATCH 4/5] BRAVO: Port to portable_simd proper instead of stable polyfill

---
 Cargo.toml              |  1 +
 src/algorithms/bravo.rs | 65 ++++++++++++-----------------------------
 src/algorithms/mod.rs   |  1 +
 src/lib.rs              |  1 +
 4 files changed, 21 insertions(+), 47 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 99ae92f..ce457a7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,7 @@ rayon = { version = "1.11.0", optional = true }
 default = []
 complex-nums = ["dep:num-complex", "dep:bytemuck"]
 parallel = ["dep:rayon"]
+nightly = []
 
 [dev-dependencies]
 criterion = "0.8.0"
diff --git a/src/algorithms/bravo.rs b/src/algorithms/bravo.rs
index 2cc4f77..83013e7 100644
--- a/src/algorithms/bravo.rs
+++ b/src/algorithms/bravo.rs
@@ -7,54 +7,25 @@
 /// For N = 2^n elements with W-element vectors, the algorithm performs log₂(N) rounds
 /// of in-place interleave operations on pairs of vectors.
 ///
-/// When nightly Rust with std::simd is available, this can use hardware SIMD instructions.
-/// For now, we implement the interleave operation manually to demonstrate the algorithm.
+/// Uses std::simd for hardware SIMD instructions on nightly Rust.
 ///
 /// The initial implementation was heavily assisted by Claude Code
+use std::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
 
 const LANES: usize = 8; // Vector width W
 
-/// A simple vector type that mimics std::simd::Simd for f64
-#[derive(Clone, Copy, Default)]
-struct Vec4<T>([T; LANES]);
-
-impl<T: Copy> Vec4<T> {
-    fn from_slice(s: &[T]) -> Self {
-        Vec4([s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]])
-    }
-
-    fn copy_to_slice(self, s: &mut [T]) {
-        s[0] = self.0[0];
-        s[1] = self.0[1];
-        s[2] = self.0[2];
-        s[3] = self.0[3];
-        s[4] = self.0[4];
-        s[5] = self.0[5];
-        s[6] = self.0[6];
-        s[7] = self.0[7];
-    }
-
-    /// Interleave two vectors, producing low and high halves.
-    /// For vectors [a0, a1, a2, a3] and [b0, b1, b2, b3]:
-    /// - low: [a0, b0, a1, b1]
-    /// - high: [a2, b2, a3, b3]
-    ///
-    /// This matches std::simd::Simd::interleave() behavior.
-    fn interleave(self, other: Vec4<T>) -> (Vec4<T>, Vec4<T>) {
-        let a = self.0;
-        let b = other.0;
-        let lo = Vec4([a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3]]);
-        let hi = Vec4([a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7]]);
-        (lo, hi)
-    }
-}
+type Vec8<T> = Simd<T, LANES>;
 
 /// Performs in-place bit-reversal permutation using the BRAVO algorithm.
 ///
 /// # Arguments
 /// * `data` - The slice to permute in-place. Length must be a power of 2 and >= LANES².
 /// * `n` - The log₂ of the data length (i.e., data.len() == 2^n)
-pub fn bit_rev_bravo<T: Default + Copy>(data: &mut [T], n: usize) {
+pub fn bit_rev_bravo<T>(data: &mut [T], n: usize)
+where
+    T: Default + Copy + Clone + SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
     let big_n = 1usize << n;
     assert_eq!(data.len(), big_n, "Data length must be 2^n");
@@ -88,20 +59,20 @@
         }
 
         // Load vectors for class A
-        let mut vecs_a: [Vec4<T>; LANES] = [Vec4([T::default(); LANES]); LANES];
+        let mut vecs_a: [Vec8<T>; LANES] = [Simd::splat(T::default()); LANES];
         for j in 0..w {
             let base_idx = (class_idx + j * num_classes) * w;
-            vecs_a[j] = Vec4::from_slice(&data[base_idx..base_idx + w]);
+            vecs_a[j] = Simd::from_slice(&data[base_idx..base_idx + w]);
         }
 
         // Perform interleave rounds for class A
         for round in 0..log_w {
-            let mut new_vecs: [Vec4<T>; LANES] = [Vec4([T::default(); LANES]); LANES];
+            let mut new_vecs: [Vec8<T>; LANES] = [Simd::splat(T::default()); LANES];
             let stride = 1 << round;
 
             // W/2 pairs per round, stored as parallel arrays
-            let mut los: [Vec4<T>; LANES / 2] = [Vec4([T::default(); LANES]); LANES / 2];
-            let mut his: [Vec4<T>; LANES / 2] = [Vec4([T::default(); LANES]); LANES / 2];
+            let mut los: [Vec8<T>; LANES / 2] = [Simd::splat(T::default()); LANES / 2];
+            let mut his: [Vec8<T>; LANES / 2] = [Simd::splat(T::default()); LANES / 2];
 
             let mut pair_idx = 0;
             let mut i = 0;
@@ -134,7 +105,7 @@
             }
         } else {
             // Swapping pair - load class B, process it, then swap both
-            let mut vecs_b: [Vec4<T>; LANES] = [Vec4([T::default(); LANES]); LANES];
+            let mut vecs_b: [Vec8<T>; LANES] = [Simd::splat(T::default()); LANES];
             for j in 0..w {
                 let base_idx = (class_idx_rev + j * num_classes) * w;
-                vecs_b[j] = Vec4::from_slice(&data[base_idx..base_idx + w]);
+                vecs_b[j] = Simd::from_slice(&data[base_idx..base_idx + w]);
             }
 
             // Perform interleave rounds for class B
             for round in 0..log_w {
-                let mut new_vecs: [Vec4<T>; LANES] = [Vec4([T::default(); 
LANES]); LANES];
+                let mut new_vecs: [Vec8<T>; LANES] = [Simd::splat(T::default()); LANES];
                 let stride = 1 << round;
 
-                let mut los: [Vec4<T>; LANES / 2] = [Vec4([T::default(); LANES]); LANES / 2];
-                let mut his: [Vec4<T>; LANES / 2] = [Vec4([T::default(); LANES]); LANES / 2];
+                let mut los: [Vec8<T>; LANES / 2] = [Simd::splat(T::default()); LANES / 2];
+                let mut his: [Vec8<T>; LANES / 2] = [Simd::splat(T::default()); LANES / 2];
 
                 let mut pair_idx = 0;
                 let mut i = 0;
diff --git a/src/algorithms/mod.rs b/src/algorithms/mod.rs
index 259b316..1d9023d 100644
--- a/src/algorithms/mod.rs
+++ b/src/algorithms/mod.rs
@@ -18,6 +18,7 @@
 //! - Use DIF if you don't want to or don't need to apply a bit reversal on the input.
 //! - Use DIT for slightly better performance.
 
+#[cfg(feature = "nightly")]
 pub mod bravo;
 pub mod cobra;
 pub mod dif;
diff --git a/src/lib.rs b/src/lib.rs
index ccab469..7354096 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+#![cfg_attr(feature = "nightly", feature(portable_simd))]
 #![doc = include_str!("../README.md")]
 #![warn(
     missing_docs,

From 7d8d8658ddabd24e22ea3e244591dac91f0fbdf2 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Tue, 20 Jan 2026 22:16:26 +0000
Subject: [PATCH 5/5] jankily switch from cobra to bravo to run benchmarks

---
 src/algorithms/dit.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs
index 0353354..e712719 100644
--- a/src/algorithms/dit.rs
+++ b/src/algorithms/dit.rs
@@ -14,7 +14,7 @@
 //! DIT starts with fine-grained memory access and progressively works with
 //! larger contiguous chunks.
 //!
-use crate::algorithms::cobra::cobra_apply; +use crate::algorithms::bravo::bit_rev_bravo; use crate::kernels::dit::{ fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_chunk_16_simd_f32, fft_dit_chunk_16_simd_f64, fft_dit_chunk_2, fft_dit_chunk_32_simd_f32, @@ -250,8 +250,8 @@ pub fn fft_64_dit_with_planner_and_opts( // DIT requires bit-reversed input run_maybe_in_parallel( opts.multithreaded_bit_reversal, - || cobra_apply(reals, log_n), - || cobra_apply(imags, log_n), + || bit_rev_bravo(reals, log_n), + || bit_rev_bravo(imags, log_n), ); // Handle inverse FFT @@ -293,8 +293,8 @@ pub fn fft_32_dit_with_planner_and_opts( // DIT requires bit-reversed input run_maybe_in_parallel( opts.multithreaded_bit_reversal, - || cobra_apply(reals, log_n), - || cobra_apply(imags, log_n), + || bit_rev_bravo(reals, log_n), + || bit_rev_bravo(imags, log_n), ); // Handle inverse FFT