diff --git a/Cargo.toml b/Cargo.toml
index 142f4fd..c38915b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,6 +19,7 @@
 num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true }
 bytemuck = { version = "1.23.2", optional = true }
 wide = "1.0.3"
 rayon = { version = "1.11.0", optional = true }
+fearless_simd = { git = "https://github.com/valadaptive/fearless_simd.git", branch = "no-x4", version = "0.3.0" }
 [features]
 default = []
diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs
index 0353354..4e20c8b 100644
--- a/src/algorithms/dit.rs
+++ b/src/algorithms/dit.rs
@@ -14,14 +14,10 @@
 //! DIT starts with fine-grained memory access and progressively works with
 //! larger contiguous chunks.
 //!
+use fearless_simd::{dispatch, Level, Simd};
+
 use crate::algorithms::cobra::cobra_apply;
-use crate::kernels::dit::{
-    fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_chunk_16_simd_f32,
-    fft_dit_chunk_16_simd_f64, fft_dit_chunk_2, fft_dit_chunk_32_simd_f32,
-    fft_dit_chunk_32_simd_f64, fft_dit_chunk_4_simd_f32, fft_dit_chunk_4_simd_f64,
-    fft_dit_chunk_64_simd_f32, fft_dit_chunk_64_simd_f64, fft_dit_chunk_8_simd_f32,
-    fft_dit_chunk_8_simd_f64,
-};
+use crate::kernels::dit::*;
 use crate::options::Options;
 use crate::parallel::run_maybe_in_parallel;
 use crate::planner::{Direction, PlannerDit32, PlannerDit64};
@@ -33,7 +29,8 @@ const L1_BLOCK_SIZE: usize = 1024;
 ///
 /// Recursively divides by 2 until reaching L1 cache size, processes stages within
 /// each block, then processes cross-block stages on return.
-fn recursive_dit_fft_f64(
+fn recursive_dit_fft_f64<S: Simd>(
+    simd: S,
     reals: &mut [f64],
     imags: &mut [f64],
     size: usize,
@@ -46,6 +43,7 @@
     if size <= L1_BLOCK_SIZE {
         for stage in 0..log_size {
             stage_twiddle_idx = execute_dit_stage_f64(
+                simd,
                 &mut reals[..size],
                 &mut imags[..size],
                 stage,
@@ -63,8 +61,8 @@
     // Recursively process both halves
     run_maybe_in_parallel(
         size > opts.smallest_parallel_chunk_size,
-        || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0),
-        || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0),
+        || recursive_dit_fft_f64(simd, re_first_half, im_first_half, half, planner, opts, 0),
+        || recursive_dit_fft_f64(simd, re_second_half, im_second_half, half, planner, opts, 0),
     );
 
     // Both halves completed stages 0..log_half-1
@@ -74,6 +72,7 @@
     // Process remaining stages that span both halves
     for stage in log_half..log_size {
         stage_twiddle_idx = execute_dit_stage_f64(
+            simd,
             &mut reals[..size],
             &mut imags[..size],
             stage,
@@ -87,7 +86,8 @@
 }
 
 /// Recursive cache-blocked DIT FFT for f32 using post-order traversal.
-fn recursive_dit_fft_f32(
+fn recursive_dit_fft_f32<S: Simd>(
+    simd: S,
     reals: &mut [f32],
     imags: &mut [f32],
     size: usize,
@@ -100,6 +100,7 @@
     if size <= L1_BLOCK_SIZE {
         for stage in 0..log_size {
             stage_twiddle_idx = execute_dit_stage_f32(
+                simd,
                 &mut reals[..size],
                 &mut imags[..size],
                 stage,
@@ -117,8 +118,8 @@
     // Recursively process both halves
     run_maybe_in_parallel(
         size > opts.smallest_parallel_chunk_size,
-        || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0),
-        || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0),
+        || recursive_dit_fft_f32(simd, re_first_half, im_first_half, half, planner, opts, 0),
+        || recursive_dit_fft_f32(simd, re_second_half, im_second_half, half, planner, opts, 0),
     );
 
     // Both halves completed stages 0..log_half-1
@@ -128,6 +129,7 @@
     // Process remaining stages that span both halves
     for stage in log_half..log_size {
         stage_twiddle_idx = execute_dit_stage_f32(
+            simd,
             &mut reals[..size],
             &mut imags[..size],
             stage,
@@ -142,7 +144,8 @@
 
 /// Execute a single DIT stage, dispatching to appropriate kernel based on chunk size.
 /// Returns updated stage_twiddle_idx.
-fn execute_dit_stage_f64(
+fn execute_dit_stage_f64<S: Simd>(
+    simd: S,
     reals: &mut [f64],
     imags: &mut [f64],
     stage: usize,
@@ -153,34 +156,35 @@
     let chunk_size = dist << 1;
 
     if chunk_size == 2 {
-        fft_dit_chunk_2(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 4 {
-        fft_dit_chunk_4_simd_f64(reals, imags);
+        fft_dit_chunk_4_f64(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 8 {
-        fft_dit_chunk_8_simd_f64(reals, imags);
+        fft_dit_chunk_8_f64(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 16 {
-        fft_dit_chunk_16_simd_f64(reals, imags);
+        fft_dit_chunk_16_f64(simd, reals, imags);
        stage_twiddle_idx
    } else if chunk_size == 32 {
-        fft_dit_chunk_32_simd_f64(reals, imags);
+        fft_dit_chunk_32_f64(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 64 {
-        fft_dit_chunk_64_simd_f64(reals, imags);
+        fft_dit_chunk_64_f64(simd, reals, imags);
         stage_twiddle_idx
     } else {
         // For larger chunks, use general kernel with twiddles from planner
         let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx];
-        fft_dit_64_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
+        fft_dit_chunk_n_f64(simd, reals, imags, twiddles_re, twiddles_im, dist);
         stage_twiddle_idx + 1
     }
 }
 
 /// Execute a single DIT stage, dispatching to appropriate kernel based on chunk size.
 /// Returns updated stage_twiddle_idx.
-fn execute_dit_stage_f32(
+fn execute_dit_stage_f32<S: Simd>(
+    simd: S,
     reals: &mut [f32],
     imags: &mut [f32],
     stage: usize,
@@ -191,27 +195,27 @@
     let chunk_size = dist << 1;
 
     if chunk_size == 2 {
-        fft_dit_chunk_2(reals, imags);
+        simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags));
         stage_twiddle_idx
     } else if chunk_size == 4 {
-        fft_dit_chunk_4_simd_f32(reals, imags);
+        fft_dit_chunk_4_f32(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 8 {
-        fft_dit_chunk_8_simd_f32(reals, imags);
+        fft_dit_chunk_8_f32(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 16 {
-        fft_dit_chunk_16_simd_f32(reals, imags);
+        fft_dit_chunk_16_f32(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 32 {
-        fft_dit_chunk_32_simd_f32(reals, imags);
+        fft_dit_chunk_32_f32(simd, reals, imags);
         stage_twiddle_idx
     } else if chunk_size == 64 {
-        fft_dit_chunk_64_simd_f32(reals, imags);
+        fft_dit_chunk_64_f32(simd, reals, imags);
         stage_twiddle_idx
     } else {
         // For larger chunks, use general kernel with twiddles from planner
         let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx];
-        fft_dit_32_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
+        fft_dit_chunk_n_f32(simd, reals, imags, twiddles_re, twiddles_im, dist);
         stage_twiddle_idx + 1
     }
 }
@@ -261,7 +265,8 @@ pub fn fft_64_dit_with_planner_and_opts(
         }
     }
 
-    recursive_dit_fft_f64(reals, imags, n, planner, opts, 0);
+    let simd_level = Level::new();
+    dispatch!(simd_level, simd => recursive_dit_fft_f64(simd, reals, imags, n, planner, opts, 0));
 
     // Scaling for inverse transform
     if let Direction::Reverse = planner.direction {
@@ -304,7 +309,8 @@ pub fn fft_32_dit_with_planner_and_opts(
         }
     }
 
-    recursive_dit_fft_f32(reals, imags, n, planner, opts, 0);
+    let simd_level = Level::new();
+    dispatch!(simd_level, simd => recursive_dit_fft_f32(simd, reals, imags, n, planner, opts, 0));
 
     // Scaling for inverse transform
     if let Direction::Reverse = planner.direction {
diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs
index 1f4c4d7..5c1f50e 100644
--- a/src/kernels/dit.rs
+++ b/src/kernels/dit.rs
@@ -4,28 +4,41 @@
 //!
 use core::f32;
+use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8, Simd, SimdBase, SimdFloat, SimdFrom};
 use num_traits::Float;
-use wide::{f32x16, f32x4, f32x8, f64x4, f64x8};
-
-use crate::kernels::common::fft_chunk_2;
 
 /// DIT butterfly for chunk_size == 2
 /// Identical to DIF version (no twiddles at size 2)
-pub fn fft_dit_chunk_2<T: Float>(reals: &mut [T], imags: &mut [T]) {
-    fft_chunk_2(reals, imags);
+#[inline(always)] // required by fearless_simd
+pub fn fft_dit_chunk_2<S: Simd, T: Float>(_simd: S, reals: &mut [T], imags: &mut [T]) {
+    reals
+        .chunks_exact_mut(2)
+        .zip(imags.chunks_exact_mut(2))
+        .for_each(|(reals_chunk, imags_chunk)| {
+            let z0_re = reals_chunk[0];
+            let z0_im = imags_chunk[0];
+            let z1_re = reals_chunk[1];
+            let z1_im = imags_chunk[1];
+
+            reals_chunk[0] = z0_re + z1_re;
+            imags_chunk[0] = z0_im + z1_im;
+            reals_chunk[1] = z0_re - z1_re;
+            imags_chunk[1] = z0_im - z1_im;
+        });
+}
+
+/// DIT butterfly for chunk_size == 4 (f64)
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_4_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_4_simd_f64(simd, reals, imags),
+    )
 }
 
 /// DIT butterfly for chunk_size == 4 (f64)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_4_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_4_simd_f64<S: Simd>(_simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 2;
     const CHUNK_SIZE: usize = DIST << 1;
 
@@ -65,16 +78,17 @@
 }
 
 /// DIT butterfly for chunk_size == 4 (f32)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_4_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_4_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_4_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 4 (f32)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_4_simd_f32<S: Simd>(_simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 2;
     const CHUNK_SIZE: usize = DIST << 1;
 
@@ -114,32 +128,39 @@
 }
 
 /// DIT butterfly for chunk_size == 8 (f64) with SIMD
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_8_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_8_simd_f64(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 8 (f64) with SIMD
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_8_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 4;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f64x4::splat(2.0);
-    let sqrt2_2 = f64x4::new([
-        1.0,                               // W_8^0 real
-        std::f64::consts::FRAC_1_SQRT_2,   // W_8^1 real (sqrt(2)/2)
-        0.0,                               // W_8^2 real
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_8^3 real (-sqrt(2)/2)
-    ]);
-    let sqrt2_2_im = f64x4::new([
-        0.0,                               // W_8^0 imag
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_8^1 imag (-sqrt(2)/2)
-        -1.0,                              // W_8^2 imag
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_8^3 imag (-sqrt(2)/2)
-    ]);
+    let two = f64x4::splat(simd, 2.0);
+    let sqrt2_2 = f64x4::simd_from(
+        simd,
+        [
+            1.0,                               // W_8^0 real
+            std::f64::consts::FRAC_1_SQRT_2,   // W_8^1 real (sqrt(2)/2)
+            0.0,                               // W_8^2 real
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_8^3 real (-sqrt(2)/2)
+        ],
+    );
+    let sqrt2_2_im = f64x4::simd_from(
+        simd,
+        [
+            0.0,                               // W_8^0 imag
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_8^1 imag (-sqrt(2)/2)
+            -1.0,                              // W_8^2 imag
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_8^3 imag (-sqrt(2)/2)
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -147,10 +168,10 @@ pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
-            let in0_re = f64x4::new(reals_s0[0..4].try_into().unwrap());
-            let in1_re = f64x4::new(reals_s1[0..4].try_into().unwrap());
-            let in0_im = f64x4::new(imags_s0[0..4].try_into().unwrap());
-            let in1_im = f64x4::new(imags_s1[0..4].try_into().unwrap());
+            let in0_re = f64x4::from_slice(simd, &reals_s0[0..4]);
+            let in1_re = f64x4::from_slice(simd, &reals_s1[0..4]);
+            let in0_im = f64x4::from_slice(simd, &imags_s0[0..4]);
+            let in1_im = f64x4::from_slice(simd, &imags_s1[0..4]);
 
             // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im
             let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re));
@@ -161,40 +182,47 @@ pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
 /// DIT butterfly for chunk_size == 8 (f32) with SIMD
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_8_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_8_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 8 (f32) with SIMD
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_8_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 4;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f32x4::splat(2.0);
-    let sqrt2_2 = f32x4::new([
-        1.0_f32,                           // W_8^0 real
-        std::f32::consts::FRAC_1_SQRT_2,   // W_8^1 real (sqrt(2)/2)
-        0.0_f32,                           // W_8^2 real
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_8^3 real (-sqrt(2)/2)
-    ]);
-    let sqrt2_2_im = f32x4::new([
-        0.0_f32,                           // W_8^0 imag
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_8^1 imag (-sqrt(2)/2)
-        -1.0_f32,                          // W_8^2 imag
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_8^3 imag (-sqrt(2)/2)
-    ]);
+    let two = f32x4::splat(simd, 2.0);
+    let sqrt2_2 = f32x4::simd_from(
+        simd,
+        [
+            1.0_f32,                           // W_8^0 real
+            std::f32::consts::FRAC_1_SQRT_2,   // W_8^1 real (sqrt(2)/2)
+            0.0_f32,                           // W_8^2 real
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_8^3 real (-sqrt(2)/2)
+        ],
+    );
+    let sqrt2_2_im = f32x4::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_8^0 imag
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_8^1 imag (-sqrt(2)/2)
+            -1.0_f32,                          // W_8^2 imag
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_8^3 imag (-sqrt(2)/2)
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -202,10 +230,10 @@ pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
-            let in0_re = f32x4::new(reals_s0[0..4].try_into().unwrap());
-            let in1_re = f32x4::new(reals_s1[0..4].try_into().unwrap());
-            let in0_im = f32x4::new(imags_s0[0..4].try_into().unwrap());
-            let in1_im = f32x4::new(imags_s1[0..4].try_into().unwrap());
+            let in0_re = f32x4::from_slice(simd, &reals_s0[0..4]);
+            let in1_re = f32x4::from_slice(simd, &reals_s1[0..4]);
+            let in0_im = f32x4::from_slice(simd, &imags_s0[0..4]);
+            let in1_im = f32x4::from_slice(simd, &imags_s1[0..4]);
 
             // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im
             let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re));
@@ -216,51 +244,58 @@ pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
-/// DIT butterfly for chunk_size == 16 (f64)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+/// DIT butterfly for chunk_size == 16 (f64)
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_16_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_16_simd_f64(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 16 (f64)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_16_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 8;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f64x8::splat(2.0);
+    let two = f64x8::splat(simd, 2.0);
 
     // Twiddle factors for W_16^k where k = 0..7
-    let twiddle_re = f64x8::new([
-        1.0,                               // W_16^0
-        0.9238795325112867,                // W_16^1 = cos(pi/8)
-        std::f64::consts::FRAC_1_SQRT_2,   // W_16^2 = sqrt(2)/2
-        0.38268343236508984,               // W_16^3 = cos(3*pi/8)
-        0.0,                               // W_16^4
-        -0.38268343236508984,              // W_16^5 = -cos(3*pi/8)
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
-        -0.9238795325112867,               // W_16^7 = -cos(pi/8)
-    ]);
-
-    let twiddle_im = f64x8::new([
-        0.0,                               // W_16^0
-        -0.38268343236508984,              // W_16^1 = -sin(pi/8)
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_16^2 = -sqrt(2)/2
-        -0.9238795325112867,               // W_16^3 = -sin(3*pi/8)
-        -1.0,                              // W_16^4
-        -0.9238795325112867,               // W_16^5 = -sin(3*pi/8)
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
-        -0.38268343236508984,              // W_16^7 = -sin(pi/8)
-    ]);
+    let twiddle_re = f64x8::simd_from(
+        simd,
+        [
+            1.0,                               // W_16^0
+            0.9238795325112867,                // W_16^1 = cos(pi/8)
+            std::f64::consts::FRAC_1_SQRT_2,   // W_16^2 = sqrt(2)/2
+            0.38268343236508984,               // W_16^3 = cos(3*pi/8)
+            0.0,                               // W_16^4
+            -0.38268343236508984,              // W_16^5 = -cos(3*pi/8)
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
+            -0.9238795325112867,               // W_16^7 = -cos(pi/8)
+        ],
+    );
+
+    let twiddle_im = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_16^0
+            -0.38268343236508984,              // W_16^1 = -sin(pi/8)
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_16^2 = -sqrt(2)/2
+            -0.9238795325112867,               // W_16^3 = -sin(3*pi/8)
+            -1.0,                              // W_16^4
+            -0.9238795325112867,               // W_16^5 = -sin(3*pi/8)
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
+            -0.38268343236508984,              // W_16^7 = -sin(pi/8)
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -269,10 +304,10 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Load all 8 elements at once
-            let in0_re = f64x8::new(reals_s0[0..8].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[0..8].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[0..8].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[0..8].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[0..8]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[0..8]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[0..8]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[0..8]);
 
             let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im));
@@ -281,51 +316,58 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
 /// DIT butterfly for chunk_size == 16 (f32)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_16_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_16_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 16 (f32)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_16_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 8;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f32x8::splat(2.0);
+    let two = f32x8::splat(simd, 2.0);
 
     // Twiddle factors for W_16^k where k = 0..7
-    let twiddle_re = f32x8::new([
-        1.0_f32,                           // W_16^0
-        0.923_879_5_f32,                   // W_16^1 = cos(pi/8)
-        std::f32::consts::FRAC_1_SQRT_2,   // W_16^2 = sqrt(2)/2
-        0.382_683_43_f32,                  // W_16^3 = cos(3*pi/8)
-        0.0_f32,                           // W_16^4
-        -0.382_683_43_f32,                 // W_16^5 = -cos(3*pi/8)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
-        -0.923_879_5_f32,                  // W_16^7 = -cos(pi/8)
-    ]);
-
-    let twiddle_im = f32x8::new([
-        0.0_f32,                           // W_16^0
-        -0.382_683_43_f32,                 // W_16^1 = -sin(pi/8)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_16^2 = -sqrt(2)/2
-        -0.923_879_5_f32,                  // W_16^3 = -sin(3*pi/8)
-        -1.0_f32,                          // W_16^4
-        -0.923_879_5_f32,                  // W_16^5 = -sin(3*pi/8)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
-        -0.382_683_43_f32,                 // W_16^7 = -sin(pi/8)
-    ]);
+    let twiddle_re = f32x8::simd_from(
+        simd,
+        [
+            1.0_f32,                           // W_16^0
+            0.923_879_5_f32,                   // W_16^1 = cos(pi/8)
+            std::f32::consts::FRAC_1_SQRT_2,   // W_16^2 = sqrt(2)/2
+            0.382_683_43_f32,                  // W_16^3 = cos(3*pi/8)
+            0.0_f32,                           // W_16^4
+            -0.382_683_43_f32,                 // W_16^5 = -cos(3*pi/8)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
+            -0.923_879_5_f32,                  // W_16^7 = -cos(pi/8)
+        ],
+    );
+
+    let twiddle_im = f32x8::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_16^0
+            -0.382_683_43_f32,                 // W_16^1 = -sin(pi/8)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_16^2 = -sqrt(2)/2
+            -0.923_879_5_f32,                  // W_16^3 = -sin(3*pi/8)
+            -1.0_f32,                          // W_16^4
+            -0.923_879_5_f32,                  // W_16^5 = -sin(3*pi/8)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_16^6 = -sqrt(2)/2
+            -0.382_683_43_f32,                 // W_16^7 = -sin(pi/8)
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -334,10 +376,10 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Load all 8 elements at once
-            let in0_re = f32x8::new(reals_s0[0..8].try_into().unwrap());
-            let in1_re = f32x8::new(reals_s1[0..8].try_into().unwrap());
-            let in0_im = f32x8::new(imags_s0[0..8].try_into().unwrap());
-            let in1_im = f32x8::new(imags_s1[0..8].try_into().unwrap());
+            let in0_re = f32x8::from_slice(simd, &reals_s0[0..8]);
+            let in1_re = f32x8::from_slice(simd, &reals_s1[0..8]);
+            let in0_im = f32x8::from_slice(simd, &imags_s0[0..8]);
+            let in1_im = f32x8::from_slice(simd, &imags_s1[0..8]);
 
             let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im));
@@ -346,73 +388,86 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
 /// DIT butterfly for chunk_size == 32 (f64)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_32_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_32_simd_f64(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 32 (f64)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_32_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 16;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f64x8::splat(2.0);
+    let two = f64x8::splat(simd, 2.0);
 
     // First 8 twiddle factors for W_32^k where k = 0..7
-    let twiddle_re_0_7 = f64x8::new([
-        1.0,                               // W_32^0 = 1
-        0.9807852804032304,                // W_32^1 = cos(π/16)
-        0.9238795325112867,                // W_32^2 = cos(π/8)
-        0.8314696123025452,                // W_32^3 = cos(3π/16)
-        std::f64::consts::FRAC_1_SQRT_2,   // W_32^4 = sqrt(2)/2
-        0.5555702330196022,                // W_32^5 = cos(5π/16)
-        0.3826834323650898,                // W_32^6 = cos(3π/8)
-        0.19509032201612825,               // W_32^7 = cos(7π/16)
-    ]);
-
-    let twiddle_im_0_7 = f64x8::new([
-        0.0,                               // W_32^0
-        -0.19509032201612825,              // W_32^1 = -sin(π/16)
-        -0.3826834323650898,               // W_32^2 = -sin(π/8)
-        -0.5555702330196022,               // W_32^3 = -sin(3π/16)
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_32^4 = -sqrt(2)/2
-        -0.8314696123025452,               // W_32^5 = -sin(5π/16)
-        -0.9238795325112867,               // W_32^6 = -sin(3π/8)
-        -0.9807852804032304,               // W_32^7 = -sin(7π/16)
-    ]);
+    let twiddle_re_0_7 = f64x8::simd_from(
+        simd,
+        [
+            1.0,                               // W_32^0 = 1
+            0.9807852804032304,                // W_32^1 = cos(π/16)
+            0.9238795325112867,                // W_32^2 = cos(π/8)
+            0.8314696123025452,                // W_32^3 = cos(3π/16)
+            std::f64::consts::FRAC_1_SQRT_2,   // W_32^4 = sqrt(2)/2
+            0.5555702330196022,                // W_32^5 = cos(5π/16)
+            0.3826834323650898,                // W_32^6 = cos(3π/8)
+            0.19509032201612825,               // W_32^7 = cos(7π/16)
+        ],
+    );
+
+    let twiddle_im_0_7 = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_32^0
+            -0.19509032201612825,              // W_32^1 = -sin(π/16)
+            -0.3826834323650898,               // W_32^2 = -sin(π/8)
+            -0.5555702330196022,               // W_32^3 = -sin(3π/16)
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_32^4 = -sqrt(2)/2
+            -0.8314696123025452,               // W_32^5 = -sin(5π/16)
+            -0.9238795325112867,               // W_32^6 = -sin(3π/8)
+            -0.9807852804032304,               // W_32^7 = -sin(7π/16)
+        ],
+    );
 
     // Second 8 twiddle factors for W_32^k where k = 8..15
-    let twiddle_re_8_15 = f64x8::new([
-        0.0,                               // W_32^8 = 0 - i
-        -0.19509032201612825,              // W_32^9
-        -0.3826834323650898,               // W_32^10
-        -0.5555702330196022,               // W_32^11
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_32^12
-        -0.8314696123025452,               // W_32^13
-        -0.9238795325112867,               // W_32^14
-        -0.9807852804032304,               // W_32^15
-    ]);
-
-    let twiddle_im_8_15 = f64x8::new([
-        -1.0,                              // W_32^8
-        -0.9807852804032304,               // W_32^9
-        -0.9238795325112867,               // W_32^10
-        -0.8314696123025452,               // W_32^11
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_32^12
-        -0.5555702330196022,               // W_32^13
-        -0.3826834323650898,               // W_32^14
-        -0.19509032201612825,              // W_32^15
-    ]);
+    let twiddle_re_8_15 = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_32^8 = 0 - i
+            -0.19509032201612825,              // W_32^9
+            -0.3826834323650898,               // W_32^10
+            -0.5555702330196022,               // W_32^11
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_32^12
+            -0.8314696123025452,               // W_32^13
+            -0.9238795325112867,               // W_32^14
+            -0.9807852804032304,               // W_32^15
+        ],
+    );
+
+    let twiddle_im_8_15 = f64x8::simd_from(
+        simd,
+        [
+            -1.0,                              // W_32^8
+            -0.9807852804032304,               // W_32^9
+            -0.9238795325112867,               // W_32^10
+            -0.8314696123025452,               // W_32^11
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_32^12
+            -0.5555702330196022,               // W_32^13
+            -0.3826834323650898,               // W_32^14
+            -0.19509032201612825,              // W_32^15
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -421,10 +476,10 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Process first 8 butterflies
-            let in0_re_0_7 = f64x8::new(reals_s0[0..8].try_into().unwrap());
-            let in1_re_0_7 = f64x8::new(reals_s1[0..8].try_into().unwrap());
-            let in0_im_0_7 = f64x8::new(imags_s0[0..8].try_into().unwrap());
-            let in1_im_0_7 = f64x8::new(imags_s1[0..8].try_into().unwrap());
+            let in0_re_0_7 = f64x8::from_slice(simd, &reals_s0[0..8]);
+            let in1_re_0_7 = f64x8::from_slice(simd, &reals_s1[0..8]);
+            let in0_im_0_7 = f64x8::from_slice(simd, &imags_s0[0..8]);
+            let in1_im_0_7 = f64x8::from_slice(simd, &imags_s1[0..8]);
 
             let out0_re_0_7 =
                 twiddle_im_0_7.mul_add(-in1_im_0_7, twiddle_re_0_7.mul_add(in1_re_0_7, in0_re_0_7));
@@ -434,16 +489,16 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re_0_7 = two.mul_sub(in0_re_0_7, out0_re_0_7);
             let out1_im_0_7 = two.mul_sub(in0_im_0_7, out0_im_0_7);
 
-            reals_s0[0..8].copy_from_slice(out0_re_0_7.as_array());
-            imags_s0[0..8].copy_from_slice(out0_im_0_7.as_array());
-            reals_s1[0..8].copy_from_slice(out1_re_0_7.as_array());
-            imags_s1[0..8].copy_from_slice(out1_im_0_7.as_array());
+            out0_re_0_7.store_slice(&mut reals_s0[0..8]);
+            out0_im_0_7.store_slice(&mut imags_s0[0..8]);
+            out1_re_0_7.store_slice(&mut reals_s1[0..8]);
+            out1_im_0_7.store_slice(&mut imags_s1[0..8]);
 
             // Process second 8 butterflies
-            let in0_re_8_15 = f64x8::new(reals_s0[8..16].try_into().unwrap());
-            let in1_re_8_15 = f64x8::new(reals_s1[8..16].try_into().unwrap());
-            let in0_im_8_15 = f64x8::new(imags_s0[8..16].try_into().unwrap());
-            let in1_im_8_15 = f64x8::new(imags_s1[8..16].try_into().unwrap());
+            let in0_re_8_15 = f64x8::from_slice(simd, &reals_s0[8..16]);
+            let in1_re_8_15 = f64x8::from_slice(simd, &reals_s1[8..16]);
+            let in0_im_8_15 = f64x8::from_slice(simd, &imags_s0[8..16]);
+            let in1_im_8_15 = f64x8::from_slice(simd, &imags_s1[8..16]);
 
             let out0_re_8_15 = twiddle_im_8_15.mul_add(
                 -in1_im_8_15,
@@ -457,67 +512,74 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re_8_15 = two.mul_sub(in0_re_8_15, out0_re_8_15);
             let out1_im_8_15 = two.mul_sub(in0_im_8_15, out0_im_8_15);
 
-            reals_s0[8..16].copy_from_slice(out0_re_8_15.as_array());
-            imags_s0[8..16].copy_from_slice(out0_im_8_15.as_array());
-            reals_s1[8..16].copy_from_slice(out1_re_8_15.as_array());
-            imags_s1[8..16].copy_from_slice(out1_im_8_15.as_array());
+            out0_re_8_15.store_slice(&mut reals_s0[8..16]);
+            out0_im_8_15.store_slice(&mut imags_s0[8..16]);
+            out1_re_8_15.store_slice(&mut reals_s1[8..16]);
+            out1_im_8_15.store_slice(&mut imags_s1[8..16]);
         });
 }
 
 /// DIT butterfly for chunk_size == 32 (f32)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_32_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_32_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 32 (f32)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_32_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 16;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f32x16::splat(2.0);
+    let two = f32x16::splat(simd, 2.0);
 
     // All 16 twiddle factors for W_32^k where k = 0..15
-    let twiddle_re = f32x16::new([
-        1.0_f32,                           // W_32^0 = 1
-        0.980_785_25_f32,                  // W_32^1 = cos(π/16)
-        0.923_879_5_f32,                   // W_32^2 = cos(π/8)
-        0.831_469_6_f32,                   // W_32^3 = cos(3π/16)
-        std::f32::consts::FRAC_1_SQRT_2,   // W_32^4 = sqrt(2)/2
-        0.555_570_24_f32,                  // W_32^5 = cos(5π/16)
-        0.382_683_43_f32,                  // W_32^6 = cos(3π/8)
-        0.195_090_32_f32,                  // W_32^7 = cos(7π/16)
-        0.0_f32,                           // W_32^8 = 0 - i
-        -0.195_090_32_f32,                 // W_32^9
-        -0.382_683_43_f32,                 // W_32^10
-        -0.555_570_24_f32,                 // W_32^11
-        -f32::consts::FRAC_1_SQRT_2,       // W_32^12
-        -0.831_469_6_f32,                  // W_32^13
-        -0.923_879_5_f32,                  // W_32^14
-        -0.980_785_25_f32,                 // W_32^15
-    ]);
-
-    let twiddle_im = f32x16::new([
-        0.0_f32,                           // W_32^0
-        -0.195_090_32_f32,                 // W_32^1 = -sin(π/16)
-        -0.382_683_43_f32,                 // W_32^2 = -sin(π/8)
-        -0.555_570_24_f32,                 // W_32^3 = -sin(3π/16)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_32^4 = -sqrt(2)/2
-        -0.831_469_6_f32,                  // W_32^5 = -sin(5π/16)
-        -0.923_879_5_f32,                  // W_32^6 = -sin(3π/8)
-        -0.980_785_25_f32,                 // W_32^7 = -sin(7π/16)
-        -1.0_f32,                          // W_32^8
-        -0.980_785_25_f32,                 // W_32^9
-        -0.923_879_5_f32,                  // W_32^10
-        -0.831_469_6_f32,                  // W_32^11
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_32^12
-        -0.555_570_24_f32,                 // W_32^13
-        -0.382_683_43_f32,                 // W_32^14
-        -0.195_090_32_f32,                 // W_32^15
-    ]);
+    let twiddle_re = f32x16::simd_from(
+        simd,
+        [
+            1.0_f32,                           // W_32^0 = 1
+            0.980_785_25_f32,                  // W_32^1 = cos(π/16)
+            0.923_879_5_f32,                   // W_32^2 = cos(π/8)
+            0.831_469_6_f32,                   // W_32^3 = cos(3π/16)
+            std::f32::consts::FRAC_1_SQRT_2,   // W_32^4 = sqrt(2)/2
+            0.555_570_24_f32,                  // W_32^5 = cos(5π/16)
+            0.382_683_43_f32,                  // W_32^6 = cos(3π/8)
+            0.195_090_32_f32,                  // W_32^7 = cos(7π/16)
+            0.0_f32,                           // W_32^8 = 0 - i
+            -0.195_090_32_f32,                 // W_32^9
+            -0.382_683_43_f32,                 // W_32^10
+            -0.555_570_24_f32,                 // W_32^11
+            -f32::consts::FRAC_1_SQRT_2,       // W_32^12
+            -0.831_469_6_f32,                  // W_32^13
+            -0.923_879_5_f32,                  // W_32^14
+            -0.980_785_25_f32,                 // W_32^15
+        ],
+    );
+
+    let twiddle_im = f32x16::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_32^0
+            -0.195_090_32_f32,                 // W_32^1 = -sin(π/16)
+            -0.382_683_43_f32,                 // W_32^2 = -sin(π/8)
+            -0.555_570_24_f32,                 // W_32^3 = -sin(3π/16)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_32^4 = -sqrt(2)/2
+            -0.831_469_6_f32,                  // W_32^5 = -sin(5π/16)
+            -0.923_879_5_f32,                  // W_32^6 = -sin(3π/8)
+            -0.980_785_25_f32,                 // W_32^7 = -sin(7π/16)
+            -1.0_f32,                          // W_32^8
+            -0.980_785_25_f32,                 // W_32^9
+            -0.923_879_5_f32,                  // W_32^10
+            -0.831_469_6_f32,                  // W_32^11
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_32^12
+            -0.555_570_24_f32,                 // W_32^13
+            -0.382_683_43_f32,                 // W_32^14
+            -0.195_090_32_f32,                 // W_32^15
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -526,10 +588,10 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Process all 16 butterflies at once with f32x16
-            let in0_re = f32x16::new(reals_s0[0..16].try_into().unwrap());
-            let in1_re = f32x16::new(reals_s1[0..16].try_into().unwrap());
-            let in0_im = f32x16::new(imags_s0[0..16].try_into().unwrap());
-            let in1_im = f32x16::new(imags_s1[0..16].try_into().unwrap());
+            let in0_re = f32x16::from_slice(simd, &reals_s0[0..16]);
+            let in1_re = f32x16::from_slice(simd, &reals_s1[0..16]);
+            let in0_im = f32x16::from_slice(simd, &imags_s0[0..16]);
+            let in1_im = f32x16::from_slice(simd, &imags_s1[0..16]);
 
             let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im));
@@ -537,121 +599,146 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0.copy_from_slice(out0_re.as_array());
-            imags_s0.copy_from_slice(out0_im.as_array());
-            reals_s1.copy_from_slice(out1_re.as_array());
-            imags_s1.copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(reals_s0);
+            out0_im.store_slice(imags_s0);
+            out1_re.store_slice(reals_s1);
+            out1_im.store_slice(imags_s1);
         });
 }
 
 /// DIT butterfly for chunk_size == 64 (f64)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_64_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_64_simd_f64(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 64 (f64)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_64_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
     const DIST: usize = 32;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f64x8::splat(2.0);
+    let two = f64x8::splat(simd, 2.0);
 
     // Process in 4 iterations of 8 butterflies each
     // Twiddles for W_64^k where k = 0..7
-    let twiddle_re_0_7 = f64x8::new([
-        1.0,                               // W_64^0 = 1
-        0.9951847266721969,                // W_64^1 = cos(π/32)
-        0.9807852804032304,                // W_64^2 = cos(π/16)
-        0.9569403357322089,                // W_64^3 = cos(3π/32)
-        0.9238795325112867,                // W_64^4 = cos(π/8)
-        0.8819212643483549,                // W_64^5 = cos(5π/32)
-        0.8314696123025452,                // W_64^6 = cos(3π/16)
-        0.773010453362737,                 // W_64^7 = cos(7π/32)
-    ]);
-
-    let twiddle_im_0_7 = f64x8::new([
-        0.0,                               // W_64^0
-        -0.0980171403295606,               // W_64^1 = -sin(π/32)
-        -0.19509032201612825,              // W_64^2 = -sin(π/16)
-        -0.29028467725446233,              // W_64^3 = -sin(3π/32)
-        -0.3826834323650898,               // W_64^4 = -sin(π/8)
-        -0.47139673682599764,              // W_64^5 = -sin(5π/32)
-        -0.5555702330196022,               // W_64^6 = -sin(3π/16)
-        -0.6343932841636455,               // W_64^7 = -sin(7π/32)
-    ]);
+    let twiddle_re_0_7 = f64x8::simd_from(
+        simd,
+        [
+            1.0,                               // W_64^0 = 1
+            0.9951847266721969,                // W_64^1 = cos(π/32)
+            0.9807852804032304,                // W_64^2 = cos(π/16)
+            0.9569403357322089,                // W_64^3 = cos(3π/32)
+            0.9238795325112867,                // W_64^4 = cos(π/8)
+            0.8819212643483549,                // W_64^5 = cos(5π/32)
+            0.8314696123025452,                // W_64^6 = cos(3π/16)
+            0.773010453362737,                 // W_64^7 = cos(7π/32)
+        ],
+    );
+
+    let twiddle_im_0_7 = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_64^0
+            -0.0980171403295606,               // W_64^1 = -sin(π/32)
+            -0.19509032201612825,              // W_64^2 = -sin(π/16)
+            -0.29028467725446233,              // W_64^3 = -sin(3π/32)
+            -0.3826834323650898,               // W_64^4 = -sin(π/8)
+            -0.47139673682599764,              // W_64^5 = -sin(5π/32)
+            -0.5555702330196022,               // W_64^6 = -sin(3π/16)
+            -0.6343932841636455,               // W_64^7 = -sin(7π/32)
+        ],
+    );
 
     // Twiddles for k = 8..15
-    let twiddle_re_8_15 = f64x8::new([
-        std::f64::consts::FRAC_1_SQRT_2,   // W_64^8 = sqrt(2)/2
-        0.6343932841636455,                // W_64^9
-        0.5555702330196022,                // W_64^10
-        0.47139673682599764,               // W_64^11
-        0.3826834323650898,                // W_64^12
-        0.29028467725446233,               // W_64^13
-        0.19509032201612825,               // W_64^14
-        0.0980171403295606,                // W_64^15
-    ]);
-
-    let twiddle_im_8_15 = f64x8::new([
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_64^8
-        -0.773010453362737,                // W_64^9
-        -0.8314696123025452,               // W_64^10
-        -0.8819212643483549,               // W_64^11
-        -0.9238795325112867,               // W_64^12
-        -0.9569403357322089,               // W_64^13
-        -0.9807852804032304,               // W_64^14
-        -0.9951847266721969,               // W_64^15
-    ]);
+    let twiddle_re_8_15 = f64x8::simd_from(
+        simd,
+        [
+            std::f64::consts::FRAC_1_SQRT_2,   // W_64^8 = sqrt(2)/2
+            0.6343932841636455,                // W_64^9
+            0.5555702330196022,                // W_64^10
+            0.47139673682599764,               // W_64^11
+            0.3826834323650898,                // W_64^12
+            0.29028467725446233,               // W_64^13
+            0.19509032201612825,               // W_64^14
+            0.0980171403295606,                // W_64^15
+        ],
+    );
+
+    let twiddle_im_8_15 = f64x8::simd_from(
+        simd,
+        [
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_64^8
+            -0.773010453362737,                // W_64^9
+            -0.8314696123025452,               // W_64^10
+            -0.8819212643483549,               // W_64^11
+            -0.9238795325112867,               // W_64^12
+            -0.9569403357322089,               // W_64^13
+            -0.9807852804032304,               // W_64^14
+            -0.9951847266721969,               // W_64^15
+        ],
+    );
 
     // Twiddles for k = 16..23
-    let twiddle_re_16_23 = f64x8::new([
-        0.0,                               // W_64^16 = -i
-        -0.0980171403295606,               // W_64^17
-        -0.19509032201612825,              // W_64^18
-        -0.29028467725446233,              // W_64^19
-        -0.3826834323650898,               // W_64^20
-        -0.47139673682599764,              // W_64^21
-        -0.5555702330196022,               // W_64^22
-        -0.6343932841636455,               // W_64^23
-    ]);
-
-    let twiddle_im_16_23 = f64x8::new([
-        -1.0,                              // W_64^16
-        -0.9951847266721969,               // W_64^17
-        -0.9807852804032304,               // W_64^18
-        -0.9569403357322089,               // W_64^19
-        -0.9238795325112867,               // W_64^20
-        -0.8819212643483549,               // W_64^21
-        -0.8314696123025452,               // W_64^22
-        -0.773010453362737,                // W_64^23
-    ]);
+    let twiddle_re_16_23 = f64x8::simd_from(
+        simd,
+        [
+            0.0,                               // W_64^16 = -i
+            -0.0980171403295606,               // W_64^17
+            -0.19509032201612825,              // W_64^18
+            -0.29028467725446233,              // W_64^19
+            -0.3826834323650898,               // W_64^20
+            -0.47139673682599764,              // W_64^21
+            -0.5555702330196022,               // W_64^22
+            -0.6343932841636455,               // W_64^23
+        ],
+    );
+
+    let twiddle_im_16_23 = f64x8::simd_from(
+        simd,
+        [
+            -1.0,                              // W_64^16
+            -0.9951847266721969,               // W_64^17
+            -0.9807852804032304,               // W_64^18
+            -0.9569403357322089,               // W_64^19
+            -0.9238795325112867,               // W_64^20
+            -0.8819212643483549,               // W_64^21
+            -0.8314696123025452,               // W_64^22
+            -0.773010453362737,                // W_64^23
+        ],
+    );
 
     // Twiddles for k = 24..31
-    let twiddle_re_24_31 = f64x8::new([
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_64^24
-        -0.773010453362737,                // W_64^25
-        -0.8314696123025452,               // W_64^26
-        -0.8819212643483549,               // W_64^27
-        -0.9238795325112867,               // W_64^28
-        -0.9569403357322089,               // W_64^29
-        -0.9807852804032304,               // W_64^30
-        -0.9951847266721969,               // W_64^31
-    ]);
-
-    let twiddle_im_24_31 = f64x8::new([
-        -std::f64::consts::FRAC_1_SQRT_2,  // W_64^24
-        -0.6343932841636455,               // W_64^25
-        -0.5555702330196022,               // W_64^26
-        -0.47139673682599764,              // W_64^27
-        -0.3826834323650898,               // W_64^28
-        -0.29028467725446233,              // W_64^29
-        -0.19509032201612825,              // W_64^30
-        -0.0980171403295606,               // W_64^31
-    ]);
+    let twiddle_re_24_31 = f64x8::simd_from(
+        simd,
+        [
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_64^24
+            -0.773010453362737,                // W_64^25
+            -0.8314696123025452,               // W_64^26
+            -0.8819212643483549,               // W_64^27
+            -0.9238795325112867,               // W_64^28
+            -0.9569403357322089,               // W_64^29
+            -0.9807852804032304,               // W_64^30
+            -0.9951847266721969,               // W_64^31
+        ],
+    );
+
+    let twiddle_im_24_31 = f64x8::simd_from(
+        simd,
+        [
+            -std::f64::consts::FRAC_1_SQRT_2,  // W_64^24
+            -0.6343932841636455,               // W_64^25
+            -0.5555702330196022,               // W_64^26
+            -0.47139673682599764,              // W_64^27
+            -0.3826834323650898,               // W_64^28
+            -0.29028467725446233,              // W_64^29
+            -0.19509032201612825,              // W_64^30
+            -0.0980171403295606,               // W_64^31
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -660,42 +747,42 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Process butterflies 0..7
-            let in0_re = f64x8::new(reals_s0[0..8].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[0..8].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[0..8].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[0..8].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[0..8]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[0..8]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[0..8]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[0..8]);
 
             let out0_re = twiddle_im_0_7.mul_add(-in1_im, twiddle_re_0_7.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im_0_7.mul_add(in1_re, twiddle_re_0_7.mul_add(in1_im, in0_im));
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[0..8].copy_from_slice(out0_re.as_array());
-            imags_s0[0..8].copy_from_slice(out0_im.as_array());
-            reals_s1[0..8].copy_from_slice(out1_re.as_array());
-            imags_s1[0..8].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[0..8]);
+            out0_im.store_slice(&mut imags_s0[0..8]);
+            out1_re.store_slice(&mut reals_s1[0..8]);
+            out1_im.store_slice(&mut imags_s1[0..8]);
 
             // Process butterflies 8..15
-            let in0_re = f64x8::new(reals_s0[8..16].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[8..16].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[8..16].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[8..16].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[8..16]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[8..16]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[8..16]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[8..16]);
 
             let out0_re = twiddle_im_8_15.mul_add(-in1_im, twiddle_re_8_15.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im_8_15.mul_add(in1_re, twiddle_re_8_15.mul_add(in1_im, in0_im));
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[8..16].copy_from_slice(out0_re.as_array());
-            imags_s0[8..16].copy_from_slice(out0_im.as_array());
-            reals_s1[8..16].copy_from_slice(out1_re.as_array());
-            imags_s1[8..16].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[8..16]);
+            out0_im.store_slice(&mut imags_s0[8..16]);
+            out1_re.store_slice(&mut reals_s1[8..16]);
+            out1_im.store_slice(&mut imags_s1[8..16]);
 
             // Process butterflies 16..23
-            let in0_re = f64x8::new(reals_s0[16..24].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[16..24].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[16..24].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[16..24].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[16..24]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[16..24]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[16..24]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[16..24]);
 
             let out0_re =
                 twiddle_im_16_23.mul_add(-in1_im, twiddle_re_16_23.mul_add(in1_re, in0_re));
@@ -704,16 +791,16 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[16..24].copy_from_slice(out0_re.as_array());
-            imags_s0[16..24].copy_from_slice(out0_im.as_array());
-            reals_s1[16..24].copy_from_slice(out1_re.as_array());
-            imags_s1[16..24].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[16..24]);
+            out0_im.store_slice(&mut imags_s0[16..24]);
+            out1_re.store_slice(&mut reals_s1[16..24]);
+            out1_im.store_slice(&mut imags_s1[16..24]);
 
             // Process butterflies 24..31
-            let in0_re = f64x8::new(reals_s0[24..32].try_into().unwrap());
-            let in1_re = f64x8::new(reals_s1[24..32].try_into().unwrap());
-            let in0_im = f64x8::new(imags_s0[24..32].try_into().unwrap());
-            let in1_im = f64x8::new(imags_s1[24..32].try_into().unwrap());
+            let in0_re = f64x8::from_slice(simd, &reals_s0[24..32]);
+            let in1_re = f64x8::from_slice(simd, &reals_s1[24..32]);
+            let in0_im = f64x8::from_slice(simd, &imags_s0[24..32]);
+            let in1_im = f64x8::from_slice(simd, &imags_s1[24..32]);
 
             let out0_re =
                 twiddle_im_24_31.mul_add(-in1_im, twiddle_re_24_31.mul_add(in1_re, in0_re));
@@ -722,107 +809,120 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[24..32].copy_from_slice(out0_re.as_array());
-            imags_s0[24..32].copy_from_slice(out0_im.as_array());
-            reals_s1[24..32].copy_from_slice(out1_re.as_array());
-            imags_s1[24..32].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[24..32]);
+            out0_im.store_slice(&mut imags_s0[24..32]);
+            out1_re.store_slice(&mut reals_s1[24..32]);
+            out1_im.store_slice(&mut imags_s1[24..32]);
         });
 }
 
 /// DIT butterfly for chunk_size == 64 (f32)
-#[multiversion::multiversion(targets(
-    "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86_64+avx2+fma",
-    "x86_64+sse4.2",
-    "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni",
-    "x86+avx2+fma",
-    "x86+sse4.2",
-    "x86+sse2",
-))]
-pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
+#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
+pub fn fft_dit_chunk_64_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
+    simd.vectorize(
+        #[inline(always)]
+        || fft_dit_chunk_64_simd_f32(simd, reals, imags),
+    )
+}
+
+/// DIT butterfly for chunk_size == 64 (f32)
+#[inline(always)] // required by fearless_simd
+fn fft_dit_chunk_64_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
     const DIST: usize = 32;
     const CHUNK_SIZE: usize = DIST << 1;
 
-    let two = f32x16::splat(2.0);
+    let two = f32x16::splat(simd, 2.0);
 
     // Process in 2 iterations of 16 butterflies each
     // Twiddles for W_64^k where k = 0..15
-    let twiddle_re_0_15 = f32x16::new([
-        1.0_f32,                           // W_64^0 = 1
-        0.995_184_7_f32,                   // W_64^1 = cos(π/32)
-        0.980_785_25_f32,                  // W_64^2 = cos(π/16)
-        0.956_940_35_f32,                  // W_64^3 = cos(3π/32)
-        0.923_879_5_f32,                   // W_64^4 = cos(π/8)
-        0.881_921_3_f32,                   // W_64^5 = cos(5π/32)
-        0.831_469_6_f32,                   // W_64^6 = cos(3π/16)
-        0.773_010_43_f32,                  // W_64^7 = cos(7π/32)
-        std::f32::consts::FRAC_1_SQRT_2,   // W_64^8 = sqrt(2)/2
-        0.634_393_3_f32,                   // W_64^9
-        0.555_570_24_f32,                  // W_64^10
-        0.471_396_74_f32,                  // W_64^11
-        0.382_683_43_f32,                  // W_64^12
-        0.290_284_66_f32,                  // W_64^13
-        0.195_090_32_f32,                  // W_64^14
-        0.098_017_14_f32,                  // W_64^15
-    ]);
-
-    let twiddle_im_0_15 = f32x16::new([
-        0.0_f32,                           // W_64^0
-        -0.098_017_14_f32,                 // W_64^1 = -sin(π/32)
-        -0.195_090_32_f32,                 // W_64^2 = -sin(π/16)
-        -0.290_284_66_f32,                 // W_64^3 = -sin(3π/32)
-        -0.382_683_43_f32,                 // W_64^4 = -sin(π/8)
-        -0.471_396_74_f32,                 // W_64^5 = -sin(5π/32)
-        -0.555_570_24_f32,                 // W_64^6 = -sin(3π/16)
-        -0.634_393_3_f32,                  // W_64^7 = -sin(7π/32)
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_64^8
-        -0.773_010_43_f32,                 // W_64^9
-        -0.831_469_6_f32,                  // W_64^10
-        -0.881_921_3_f32,                  // W_64^11
-        -0.923_879_5_f32,                  // W_64^12
-        -0.956_940_35_f32,                 // W_64^13
-        -0.980_785_25_f32,                 // W_64^14
-        -0.995_184_7_f32,                  // W_64^15
-    ]);
+    let twiddle_re_0_15 = f32x16::simd_from(
+        simd,
+        [
+            1.0_f32,                           // W_64^0 = 1
+            0.995_184_7_f32,                   // W_64^1 = cos(π/32)
+            0.980_785_25_f32,                  // W_64^2 = cos(π/16)
+            0.956_940_35_f32,                  // W_64^3 = cos(3π/32)
+            0.923_879_5_f32,                   // W_64^4 = cos(π/8)
+            0.881_921_3_f32,                   // W_64^5 = cos(5π/32)
+            0.831_469_6_f32,                   // W_64^6 = cos(3π/16)
+            0.773_010_43_f32,                  // W_64^7 = cos(7π/32)
+            std::f32::consts::FRAC_1_SQRT_2,   // W_64^8 = sqrt(2)/2
+            0.634_393_3_f32,                   // W_64^9
+            0.555_570_24_f32,                  // W_64^10
+            0.471_396_74_f32,                  // W_64^11
+            0.382_683_43_f32,                  // W_64^12
+            0.290_284_66_f32,                  // W_64^13
+            0.195_090_32_f32,                  // W_64^14
+            0.098_017_14_f32,                  // W_64^15
+        ],
+    );
+
+    let twiddle_im_0_15 = f32x16::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_64^0
+            -0.098_017_14_f32,                 // W_64^1 = -sin(π/32)
+            -0.195_090_32_f32,                 // W_64^2 = -sin(π/16)
+            -0.290_284_66_f32,                 // W_64^3 = -sin(3π/32)
+            -0.382_683_43_f32,                 // W_64^4 = -sin(π/8)
+            -0.471_396_74_f32,                 // W_64^5 = -sin(5π/32)
+            -0.555_570_24_f32,                 // W_64^6 = -sin(3π/16)
+            -0.634_393_3_f32,                  // W_64^7 = -sin(7π/32)
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_64^8
+            -0.773_010_43_f32,                 // W_64^9
+            -0.831_469_6_f32,                  // W_64^10
+            -0.881_921_3_f32,                  // W_64^11
+            -0.923_879_5_f32,                  // W_64^12
+            -0.956_940_35_f32,                 // W_64^13
+            -0.980_785_25_f32,                 // W_64^14
+            -0.995_184_7_f32,                  // W_64^15
+        ],
+    );
 
     // Twiddles for k = 16..31
-    let twiddle_re_16_31 = f32x16::new([
-        0.0_f32,                           // W_64^16 = -i
-        -0.098_017_14_f32,                 // W_64^17
-        -0.195_090_32_f32,                 // W_64^18
-        -0.290_284_66_f32,                 // W_64^19
-        -0.382_683_43_f32,                 // W_64^20
-        -0.471_396_74_f32,                 // W_64^21
-        -0.555_570_24_f32,                 // W_64^22
-        -0.634_393_3_f32,                  // W_64^23
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_64^24
-        -0.773_010_43_f32,                 // W_64^25
-        -0.831_469_6_f32,                  // W_64^26
-        -0.881_921_3_f32,                  // W_64^27
-        -0.923_879_5_f32,                  // W_64^28
-        -0.956_940_35_f32,                 // W_64^29
-        -0.980_785_25_f32,                 // W_64^30
-        -0.995_184_7_f32,                  // W_64^31
-    ]);
-
-    let twiddle_im_16_31 = f32x16::new([
-        -1.0_f32,                          // W_64^16
-        -0.995_184_7_f32,                  // W_64^17
-        -0.980_785_25_f32,                 // W_64^18
-        -0.956_940_35_f32,                 // W_64^19
-        -0.923_879_5_f32,                  // W_64^20
-        -0.881_921_3_f32,                  // W_64^21
-        -0.831_469_6_f32,                  // W_64^22
-        -0.773_010_43_f32,                 // W_64^23
-        -std::f32::consts::FRAC_1_SQRT_2,  // W_64^24
-        -0.634_393_3_f32,                  // W_64^25
-        -0.555_570_24_f32,                 // W_64^26
-        -0.471_396_74_f32,                 // W_64^27
-        -0.382_683_43_f32,                 // W_64^28
-        -0.290_284_66_f32,                 // W_64^29
-        -0.195_090_32_f32,                 // W_64^30
-        -0.098_017_14_f32,                 // W_64^31
-    ]);
+    let twiddle_re_16_31 = f32x16::simd_from(
+        simd,
+        [
+            0.0_f32,                           // W_64^16 = -i
+            -0.098_017_14_f32,                 // W_64^17
+            -0.195_090_32_f32,                 // W_64^18
+            -0.290_284_66_f32,                 // W_64^19
+            -0.382_683_43_f32,                 // W_64^20
+            -0.471_396_74_f32,                 // W_64^21
+            -0.555_570_24_f32,                 // W_64^22
+            -0.634_393_3_f32,                  // W_64^23
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_64^24
+            -0.773_010_43_f32,                 // W_64^25
+            -0.831_469_6_f32,                  // W_64^26
+            -0.881_921_3_f32,                  // W_64^27
+            -0.923_879_5_f32,                  // W_64^28
+            -0.956_940_35_f32,                 // W_64^29
+            -0.980_785_25_f32,                 // W_64^30
+            -0.995_184_7_f32,                  // W_64^31
+        ],
+    );
+
+    let twiddle_im_16_31 = f32x16::simd_from(
+        simd,
+        [
+            -1.0_f32,                          // W_64^16
+            -0.995_184_7_f32,                  // W_64^17
+            -0.980_785_25_f32,                 // W_64^18
+            -0.956_940_35_f32,                 // W_64^19
+            -0.923_879_5_f32,                  // W_64^20
+            -0.881_921_3_f32,                  // W_64^21
+            -0.831_469_6_f32,                  // W_64^22
+            -0.773_010_43_f32,                 // W_64^23
+            -std::f32::consts::FRAC_1_SQRT_2,  // W_64^24
+            -0.634_393_3_f32,                  // W_64^25
+            -0.555_570_24_f32,                 // W_64^26
+            -0.471_396_74_f32,                 // W_64^27
+            -0.382_683_43_f32,                 // W_64^28
+            -0.290_284_66_f32,                 // W_64^29
+            -0.195_090_32_f32,                 // W_64^30
+            -0.098_017_14_f32,                 // W_64^31
+        ],
+    );
 
     (reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
         .zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
@@ -831,26 +931,26 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
 
             // Process butterflies 0..15
-            let in0_re = f32x16::new(reals_s0[0..16].try_into().unwrap());
-            let in1_re = f32x16::new(reals_s1[0..16].try_into().unwrap());
-            let in0_im = f32x16::new(imags_s0[0..16].try_into().unwrap());
-            let in1_im = f32x16::new(imags_s1[0..16].try_into().unwrap());
+            let in0_re = f32x16::from_slice(simd, &reals_s0[0..16]);
+            let in1_re = f32x16::from_slice(simd, &reals_s1[0..16]);
+            let in0_im = f32x16::from_slice(simd, &imags_s0[0..16]);
+            let in1_im = f32x16::from_slice(simd, &imags_s1[0..16]);
 
             let out0_re = twiddle_im_0_15.mul_add(-in1_im, twiddle_re_0_15.mul_add(in1_re, in0_re));
             let out0_im = twiddle_im_0_15.mul_add(in1_re, twiddle_re_0_15.mul_add(in1_im, in0_im));
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[0..16].copy_from_slice(out0_re.as_array());
-            imags_s0[0..16].copy_from_slice(out0_im.as_array());
-            reals_s1[0..16].copy_from_slice(out1_re.as_array());
-            imags_s1[0..16].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[0..16]);
+            out0_im.store_slice(&mut imags_s0[0..16]);
+            out1_re.store_slice(&mut reals_s1[0..16]);
+            out1_im.store_slice(&mut imags_s1[0..16]);
 
             // Process butterflies 16..31
-            let in0_re = f32x16::new(reals_s0[16..32].try_into().unwrap());
-            let in1_re = f32x16::new(reals_s1[16..32].try_into().unwrap());
-            let in0_im = f32x16::new(imags_s0[16..32].try_into().unwrap());
-            let in1_im = f32x16::new(imags_s1[16..32].try_into().unwrap());
+            let in0_re = f32x16::from_slice(simd, &reals_s0[16..32]);
+            let in1_re = f32x16::from_slice(simd, &reals_s1[16..32]);
+            let in0_im = f32x16::from_slice(simd, &imags_s0[16..32]);
+            let in1_im = f32x16::from_slice(simd, &imags_s1[16..32]);
 
             let out0_re =
                 twiddle_im_16_31.mul_add(-in1_im, twiddle_re_16_31.mul_add(in1_re, in0_re));
@@ -859,24 +959,33 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) {
             let out1_re = two.mul_sub(in0_re, out0_re);
             let out1_im = two.mul_sub(in0_im, out0_im);
 
-            reals_s0[16..32].copy_from_slice(out0_re.as_array());
-            imags_s0[16..32].copy_from_slice(out0_im.as_array());
-            reals_s1[16..32].copy_from_slice(out1_re.as_array());
-            imags_s1[16..32].copy_from_slice(out1_im.as_array());
+            out0_re.store_slice(&mut reals_s0[16..32]);
+            out0_im.store_slice(&mut imags_s0[16..32]);
+            out1_re.store_slice(&mut reals_s1[16..32]);
+            out1_im.store_slice(&mut imags_s1[16..32]);
         });
 }
 
 /// General DIT butterfly for f64
-#[multiversion::multiversion(targets(
"x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_64_chunk_n_simd( +#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure +pub fn fft_dit_chunk_n_f64( + simd: S, + reals: &mut [f64], + imags: &mut [f64], + twiddles_re: &[f64], + twiddles_im: &[f64], + dist: usize, +) { + simd.vectorize( + #[inline(always)] + || fft_dit_chunk_n_simd_f64(simd, reals, imags, twiddles_re, twiddles_im, dist), + ) +} + +/// General DIT butterfly for f64 +#[inline(always)] // required by fearless_simd +fn fft_dit_chunk_n_simd_f64( + simd: S, reals: &mut [f64], imags: &mut [f64], twiddles_re: &[f64], @@ -901,14 +1010,14 @@ pub fn fft_dit_64_chunk_n_simd( .zip(twiddles_re.as_chunks::().0.iter()) .zip(twiddles_im.as_chunks::().0.iter()) .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { - let two = f64x8::splat(2.0); - let in0_re = f64x8::new(*re_s0); - let in1_re = f64x8::new(*re_s1); - let in0_im = f64x8::new(*im_s0); - let in1_im = f64x8::new(*im_s1); + let two = f64x8::splat(simd, 2.0); + let in0_re = f64x8::simd_from(simd, *re_s0); + let in1_re = f64x8::simd_from(simd, *re_s1); + let in0_im = f64x8::simd_from(simd, *im_s0); + let in1_im = f64x8::simd_from(simd, *im_s1); - let tw_re = f64x8::new(*tw_re); - let tw_im = f64x8::new(*tw_im); + let tw_re = f64x8::simd_from(simd, *tw_re); + let tw_im = f64x8::simd_from(simd, *tw_im); // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re)); @@ -919,25 +1028,34 @@ pub fn fft_dit_64_chunk_n_simd( let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); - re_s0.copy_from_slice(out0_re.as_array()); - im_s0.copy_from_slice(out0_im.as_array()); - re_s1.copy_from_slice(out1_re.as_array()); - im_s1.copy_from_slice(out1_im.as_array()); + out0_re.store_slice(re_s0); + out0_im.store_slice(im_s0); + out1_re.store_slice(re_s1); + out1_im.store_slice(im_s1); }); }); } /// General DIT butterfly for f32 -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -pub fn fft_dit_32_chunk_n_simd( +#[inline(never)] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure +pub fn fft_dit_chunk_n_f32( + simd: S, + reals: &mut [f32], + imags: &mut [f32], + twiddles_re: &[f32], + twiddles_im: &[f32], + dist: usize, +) { + simd.vectorize( + #[inline(always)] + || fft_dit_chunk_n_simd_f32(simd, reals, imags, twiddles_re, twiddles_im, dist), + ) +} + +/// General DIT butterfly for f32 +#[inline(always)] // required by fearless_simd +fn fft_dit_chunk_n_simd_f32( + simd: S, reals: &mut [f32], imags: &mut [f32], twiddles_re: &[f32], @@ -962,14 +1080,14 @@ pub fn fft_dit_32_chunk_n_simd( .zip(twiddles_re.as_chunks::().0.iter()) .zip(twiddles_im.as_chunks::().0.iter()) .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { - let two = f32x16::splat(2.0); - let in0_re = f32x16::new(*re_s0); - let in1_re = f32x16::new(*re_s1); - let in0_im = f32x16::new(*im_s0); - let in1_im = f32x16::new(*im_s1); + let two = f32x16::splat(simd, 2.0); + let in0_re = f32x16::simd_from(simd, *re_s0); + let in1_re = 
+                let in0_im = f32x16::simd_from(simd, *im_s0);
+                let in1_im = f32x16::simd_from(simd, *im_s1);
 
-                let tw_re = f32x16::new(*tw_re);
-                let tw_im = f32x16::new(*tw_im);
+                let tw_re = f32x16::simd_from(simd, *tw_re);
+                let tw_im = f32x16::simd_from(simd, *tw_im);
 
                 // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im
                 let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re));
@@ -980,10 +1098,10 @@
                 let out1_re = two.mul_sub(in0_re, out0_re);
                 let out1_im = two.mul_sub(in0_im, out0_im);
 
-                re_s0.copy_from_slice(out0_re.as_array());
-                im_s0.copy_from_slice(out0_im.as_array());
-                re_s1.copy_from_slice(out1_re.as_array());
-                im_s1.copy_from_slice(out1_im.as_array());
+                out0_re.store_slice(re_s0);
+                out0_im.store_slice(im_s0);
+                out1_re.store_slice(re_s1);
+                out1_im.store_slice(im_s1);
             });
    });
 }
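
Note for reviewers: every kernel in this patch uses the same butterfly identity. `out0 = in0 + w*in1` is computed with two FMAs, and `out1 = in0 - w*in1` is then recovered as `2*in0 - out0` with a single fused multiply-subtract instead of a second complex multiply. A minimal scalar reference (illustrative only; `butterfly_ref` is not part of the patch):

```rust
/// Scalar reference for the DIT butterfly the SIMD kernels above implement.
/// Complex numbers are (re, im) pairs; `w` is the twiddle factor.
fn butterfly_ref(in0: (f64, f64), in1: (f64, f64), w: (f64, f64)) -> ((f64, f64), (f64, f64)) {
    // out0 = in0 + w * in1 (complex multiply-accumulate)
    let out0_re = in0.0 + w.0 * in1.0 - w.1 * in1.1;
    let out0_im = in0.1 + w.0 * in1.1 + w.1 * in1.0;
    // out1 = in0 - w * in1, recovered as 2*in0 - out0 so the SIMD kernels can
    // reuse out0 via `two.mul_sub(in0, out0)` instead of a second complex multiply
    let out1_re = 2.0 * in0.0 - out0_re;
    let out1_im = 2.0 * in0.1 - out0_im;
    ((out0_re, out0_im), (out1_re, out1_im))
}
```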
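For context, a minimal sketch of the runtime-dispatch pattern this patch adopts, assuming the API of the fearless_simd fork pinned in Cargo.toml (as exercised above): `Level::new()` detects the best available SIMD level once, `dispatch!` monomorphizes the call for that level, and `vectorize` re-asserts the target features inside the kernel so the compiler can emit wide instructions. The `kernel` function and its body are hypothetical placeholders.

```rust
use fearless_simd::{dispatch, Level, Simd};

// `inline(never)` mirrors the kernels above: it keeps each kernel as its own
// function so register pressure in the caller stays manageable.
#[inline(never)]
fn kernel<S: Simd>(simd: S, xs: &mut [f32]) {
    simd.vectorize(
        #[inline(always)]
        || xs.iter_mut().for_each(|x| *x *= 2.0),
    )
}

fn main() {
    let mut xs = vec![1.0_f32; 8];
    let level = Level::new(); // one-time CPU feature detection
    dispatch!(level, simd => kernel(simd, &mut xs));
    assert_eq!(xs[0], 2.0);
}
```

This replaces the per-function `#[multiversion]` attribute: feature detection happens once at the FFT entry point rather than on every kernel call, and the same code path also covers non-x86 targets such as ARM NEON.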