diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 853a47e9..384c0c63 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -178,6 +178,10 @@ impl Simd for Avx2 { unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) } } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + unsafe { _mm256_cvtps_pd(a.into()).simd_into(self) } + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { f64x2 { val: bytemuck::cast(a.val), @@ -1449,6 +1453,15 @@ impl Simd for Avx2 { } } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + unsafe { + let (a0, a1) = self.split_f32x8(a); + let high = _mm256_cvtps_pd(a0.into()).simd_into(self); + let low = _mm256_cvtps_pd(a1.into()).simd_into(self); + self.combine_f64x4(high, low) + } + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { f64x4 { val: bytemuck::cast(a.val), @@ -2818,6 +2831,10 @@ impl Simd for Avx2 { } } #[inline(always)] + fn narrow_f64x4(self, a: f64x4) -> f32x4 { + unsafe { _mm256_cvtpd_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { f32x8 { val: bytemuck::cast(a.val), @@ -3052,6 +3069,18 @@ impl Simd for Avx2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + crate::Fallback::new() + .load_interleaved_128_f32x16(src) + .val + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let fb = crate::Fallback::new(); + fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); + } + #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( @@ -3068,18 +3097,6 @@ impl Simd for Avx2 { ) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - crate::Fallback::new() - .load_interleaved_128_f32x16(src) - .val - .simd_into(self) - } - #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let fb = crate::Fallback::new(); - fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); - } - #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ -4484,6 +4501,15 @@ impl Simd for Avx2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a, b) = self.split_f64x8(a); + unsafe { + let lo = _mm256_cvtpd_ps(a.into()); + let hi = _mm256_cvtpd_ps(b.into()); + _mm256_setr_m128(lo, hi).simd_into(self) + } + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index 8e7d1db5..75de5b59 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -338,6 +338,16 @@ impl Simd for Fallback { result.simd_into(self) } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + [ + a[0usize] as f64, + a[1usize] as f64, + a[2usize] as f64, + a[3usize] as f64, + ] + .simd_into(self) + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { f64x2 { val: bytemuck::cast(a.val), @@ -3251,6 +3261,11 @@ impl Simd for Fallback { 
(b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1)) + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( @@ -4684,6 +4699,16 @@ impl Simd for Fallback { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x4(self, a: f64x4) -> f32x4 { + [ + a[0usize] as f32, + a[1usize] as f32, + a[2usize] as f32, + a[3usize] as f32, + ] + .simd_into(self) + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( @@ -4934,22 +4959,6 @@ impl Simd for Fallback { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f64x4( - self.reinterpret_f64_f32x8(a0), - self.reinterpret_f64_f32x8(a1), - ) - } - #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.reinterpret_i32_f32x8(a0), - self.reinterpret_i32_f32x8(a1), - ) - } - #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { [ src[0usize], @@ -4980,6 +4989,22 @@ impl Simd for Fallback { ]; } #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) + } + #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ -6489,6 +6514,11 @@ impl Simd for Fallback { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1)) + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index a533ce76..2f03bb9b 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -184,6 +184,14 @@ impl Simd for Neon { result.simd_into(self) } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + unsafe { + let low = vcvt_f64_f32(vget_low_f32(a.into())); + let high = vcvt_high_f64_f32(a.into()); + float64x2x2_t(low, high).simd_into(self) + } + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { unsafe { vreinterpretq_f64_f32(a.into()).simd_into(self) } } @@ -1401,6 +1409,11 @@ impl Simd for Neon { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1)) + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( @@ -2821,6 +2834,13 @@ impl Simd for Neon { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x4(self, a: 
f64x4) -> f32x4 { + unsafe { + let converted: float64x2x2_t = a.into(); + vcvt_high_f32_f64(vcvt_f32_f64(converted.0), converted.1).simd_into(self) + } + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( @@ -3071,6 +3091,14 @@ impl Simd for Neon { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + unsafe { vld4q_f32(src.as_ptr()).simd_into(self) } + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) } + } + #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( @@ -3087,14 +3115,6 @@ impl Simd for Neon { ) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - unsafe { vld4q_f32(src.as_ptr()).simd_into(self) } - } - #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) } - } - #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ -4465,6 +4485,11 @@ impl Simd for Neon { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1)) + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index e762c279..b10ea490 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -96,6 +96,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn trunc_f32x4(self, a: f32x4) -> f32x4; fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4; fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8; + fn widen_f32x4(self, a: f32x4) -> f64x4; fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2; fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4; fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16; @@ -374,6 +375,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8; fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16; fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4); + fn widen_f32x8(self, a: f32x8) -> f64x8; fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4; fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8; fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32; @@ -619,6 +621,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4; fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8; fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2); + fn narrow_f64x4(self, a: f64x4) -> f32x4; fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8; fn splat_mask64x4(self, val: i64) -> mask64x4; fn not_mask64x4(self, a: mask64x4) -> mask64x4; @@ -663,10 +666,10 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn trunc_f32x16(self, a: f32x16) -> f32x16; fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16; fn split_f32x16(self, a: 
f32x16) -> (f32x8, f32x8); - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8; - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16; fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16; fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> (); + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8; + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16; fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64; fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16; fn cvt_u32_f32x16(self, a: f32x16) -> u32x16; @@ -905,6 +908,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { fn trunc_f64x8(self, a: f64x8) -> f64x8; fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8; fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4); + fn narrow_f64x8(self, a: f64x8) -> f32x8; fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16; fn splat_mask64x8(self, val: i64) -> mask64x8; fn not_mask64x8(self, a: mask64x8) -> mask64x8; diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index b6d53c39..9a48e569 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -186,6 +186,16 @@ impl Simd for Sse4_2 { result.simd_into(self) } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + unsafe { + let raw = a.into(); + let high = _mm_cvtps_pd(raw).simd_into(self); + let low = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128::<8>(_mm_castps_si128(raw)))) + .simd_into(self); + self.combine_f64x2(high, low) + } + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { f64x2 { val: bytemuck::cast(a.val), @@ -1527,6 +1537,11 @@ impl Simd for Sse4_2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1)) + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( @@ -2949,6 +2964,15 @@ impl Simd for Sse4_2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x4(self, a: f64x4) -> f32x4 { + let (a, b) = self.split_f64x4(a); + unsafe { + let lo = _mm_cvtpd_ps(a.into()); + let hi = _mm_cvtpd_ps(b.into()); + _mm_movelh_ps(lo, hi).simd_into(self) + } + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( @@ -3199,6 +3223,18 @@ impl Simd for Sse4_2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + crate::Fallback::new() + .load_interleaved_128_f32x16(src) + .val + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let fb = crate::Fallback::new(); + fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); + } + #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( @@ -3215,18 +3251,6 @@ impl Simd for Sse4_2 { ) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - crate::Fallback::new() - .load_interleaved_128_f32x16(src) - .val - .simd_into(self) - } - #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let fb = crate::Fallback::new(); - fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); - } - 
#[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ -4623,6 +4647,11 @@ impl Simd for Sse4_2 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1)) + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 1383f7ca..f6f9623f 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -169,6 +169,12 @@ impl Simd for WasmSimd128 { result.simd_into(self) } #[inline(always)] + fn widen_f32x4(self, a: f32x4) -> f64x4 { + let low = f64x2_promote_low_f32x4(a.into()); + let high = f64x2_promote_low_f32x4(u64x2_shuffle::<1, 1>(a.into(), a.into())); + self.combine_f64x2(low.simd_into(self), high.simd_into(self)) + } + #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { ::from(a).simd_into(self) } @@ -1356,6 +1362,11 @@ impl Simd for WasmSimd128 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn widen_f32x8(self, a: f32x8) -> f64x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1)) + } + #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( @@ -2776,6 +2787,14 @@ impl Simd for WasmSimd128 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x4(self, a: f64x4) -> f32x4 { + let (low, high) = self.split_f64x4(a); + let low = f32x4_demote_f64x2_zero(low.into()); + let high = f32x4_demote_f64x2_zero(high.into()); + let result = u64x2_shuffle::<0, 2>(low, high); + result.simd_into(self) + } + #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( @@ -3026,22 +3045,6 @@ impl Simd for WasmSimd128 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f64x4( - self.reinterpret_f64_f32x8(a0), - self.reinterpret_f64_f32x8(a1), - ) - } - #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.reinterpret_i32_f32x8(a0), - self.reinterpret_i32_f32x8(a1), - ) - } - #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { let v0: v128 = unsafe { v128_load(src[0 * 4usize..].as_ptr() as *const v128) }; let v1: v128 = unsafe { v128_load(src[1 * 4usize..].as_ptr() as *const v128) }; @@ -3084,6 +3087,22 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) + } + #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) @@ 
-4580,6 +4599,11 @@ impl Simd for WasmSimd128 { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] + fn narrow_f64x8(self, a: f64x8) -> f32x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1)) + } + #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( diff --git a/fearless_simd_gen/src/mk_avx2.rs b/fearless_simd_gen/src/mk_avx2.rs index 5e565d4d..a11965cf 100644 --- a/fearless_simd_gen/src/mk_avx2.rs +++ b/fearless_simd_gen/src/mk_avx2.rs @@ -352,8 +352,8 @@ pub(crate) fn handle_widen_narrow( } "narrow" => { let dst_width = t.n_bits(); - match (dst_width, vec_ty.n_bits()) { - (128, 256) => { + match (vec_ty.scalar, dst_width, vec_ty.n_bits()) { + (ScalarType::Unsigned, 128, 256) => { let mask = match t.scalar_bits { 8 => { quote! { 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1 } } @@ -371,7 +371,14 @@ pub(crate) fn handle_widen_narrow( } } } - (256, 512) => { + (ScalarType::Float, 128, 256) => { + quote! { + unsafe { + _mm256_cvtpd_ps(a.into()).simd_into(self) + } + } + } + (ScalarType::Unsigned, 256, 512) => { let mask = set1_intrinsic(&VecType::new( vec_ty.scalar, vec_ty.scalar_bits, @@ -398,6 +405,17 @@ pub(crate) fn handle_widen_narrow( } } } + (ScalarType::Float, 256, 512) => { + let split = format_ident!("split_{}", vec_ty.rust_name()); + quote! { + let (a, b) = self.#split(a); + unsafe { + let lo = _mm256_cvtpd_ps(a.into()); + let hi = _mm256_cvtpd_ps(b.into()); + _mm256_setr_m128(lo, hi).simd_into(self) + } + } + } _ => unimplemented!(), } } diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 693ae9fc..0551781f 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -185,49 +185,125 @@ fn mk_simd_impl(level: Level) -> TokenStream { } } OpSig::WidenNarrow(target_ty) => { + assert_eq!( + vec_ty.scalar, target_ty.scalar, + "widen/narrow ops are done between the same scalar type" + ); + assert_eq!( + vec_ty.len, target_ty.len, + "widen/narrow ops preserve the vector length" + ); + let vec_scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits); let target_scalar_ty = target_ty.scalar.rust(target_ty.scalar_bits); - if method == "narrow" { - let arch = neon::arch_ty(vec_ty); - - let id1 = - Ident::new(&format!("vmovn_{}", vec_scalar_ty), Span::call_site()); - let id2 = Ident::new( - &format!("vcombine_{}", target_scalar_ty), - Span::call_site(), - ); - - quote! { - #method_sig { - unsafe { - let converted: #arch = a.into(); - let low = #id1(converted.0); - let high = #id1(converted.1); + match method { + "widen" => { + assert_eq!( + vec_ty.scalar_bits * 2, + target_ty.scalar_bits, + "the destination is twice as wide as the source" + ); - #id2(low, high).simd_into(self) + let arch = neon::arch_ty(&target_ty); + + if vec_ty.scalar == ScalarType::Float { + assert_eq!( + vec_ty.scalar_bits, 32, + "only widening from f32x4 to f64x4 is supported" + ); + assert_eq!( + vec_ty.len, 4, + "only widening from f32x4 to f64x4 is supported" + ); + + quote! 
{ + #method_sig { + unsafe { + let low = vcvt_f64_f32(vget_low_f32(a.into())); + let high = vcvt_high_f64_f32(a.into()); + + #arch(low, high).simd_into(self) + } + } + } + } else { + let id1 = Ident::new( + &format!("vmovl_{}", vec_scalar_ty), + Span::call_site(), + ); + let id2 = Ident::new( + &format!("vget_low_{}", vec_scalar_ty), + Span::call_site(), + ); + let id3 = Ident::new( + &format!("vget_high_{}", vec_scalar_ty), + Span::call_site(), + ); + + quote! { + #method_sig { + unsafe { + let low = #id1(#id2(a.into())); + let high = #id1(#id3(a.into())); + + #arch(low, high).simd_into(self) + } + } } } } - } else { - let arch = neon::arch_ty(&target_ty); - let id1 = - Ident::new(&format!("vmovl_{}", vec_scalar_ty), Span::call_site()); - let id2 = - Ident::new(&format!("vget_low_{}", vec_scalar_ty), Span::call_site()); - let id3 = - Ident::new(&format!("vget_high_{}", vec_scalar_ty), Span::call_site()); - - quote! { - #method_sig { - unsafe { - let low = #id1(#id2(a.into())); - let high = #id1(#id3(a.into())); + "narrow" => { + assert_eq!( + vec_ty.scalar_bits / 2, + target_ty.scalar_bits, + "the destination is half as wide as the source" + ); - #arch(low, high).simd_into(self) + let arch = neon::arch_ty(vec_ty); + + if vec_ty.scalar == ScalarType::Float { + assert_eq!( + vec_ty.scalar_bits, 64, + "only narrowing from f64x4 to f32x4 is supported" + ); + assert_eq!( + vec_ty.len, 4, + "only narrowing from f64x4 to f32x4 is supported" + ); + + quote! { + #method_sig { + unsafe { + let converted: #arch = a.into(); + vcvt_high_f32_f64(vcvt_f32_f64(converted.0), converted.1).simd_into(self) + } + } + } + } else { + let id1 = Ident::new( + &format!("vmovn_{}", vec_scalar_ty), + Span::call_site(), + ); + let id2 = Ident::new( + &format!("vcombine_{}", target_scalar_ty), + Span::call_site(), + ); + + quote! { + #method_sig { + unsafe { + let converted: #arch = a.into(); + let low = #id1(converted.0); + let high = #id1(converted.1); + + #id2(low, high).simd_into(self) + } + } } } } + _ => unreachable!(), } } OpSig::Binary => { diff --git a/fearless_simd_gen/src/mk_sse4_2.rs b/fearless_simd_gen/src/mk_sse4_2.rs index e0bcd653..f3ad873b 100644 --- a/fearless_simd_gen/src/mk_sse4_2.rs +++ b/fearless_simd_gen/src/mk_sse4_2.rs @@ -348,6 +348,23 @@ pub(crate) fn handle_widen_narrow( } .rust_name() ); + let shifted = if vec_ty.scalar == ScalarType::Float { + let to_ints = cast_ident( + ScalarType::Float, + ScalarType::Int, + vec_ty.scalar_bits, + vec_ty.n_bits(), + ); + let from_ints = cast_ident( + ScalarType::Int, + ScalarType::Float, + vec_ty.scalar_bits, + vec_ty.n_bits(), + ); + quote! { #from_ints(_mm_srli_si128::<8>(#to_ints(raw))) } + } else { + quote! { _mm_srli_si128::<8>(raw) } + }; quote! { #method_sig { unsafe { @@ -355,37 +372,55 @@ pub(crate) fn handle_widen_narrow( let high = #extend(raw).simd_into(self); // Shift by 8 since we want to get the higher part into the // lower position. - let low = #extend(_mm_srli_si128::<8>(raw)).simd_into(self); + let low = #extend(#shifted).simd_into(self); self.#combine(high, low) } } } } "narrow" => { - let mask = set1_intrinsic(&VecType::new( - vec_ty.scalar, - vec_ty.scalar_bits, - vec_ty.len / 2, - )); - let pack = pack_intrinsic( - vec_ty.scalar_bits, - matches!(vec_ty.scalar, ScalarType::Int), - t.n_bits(), - ); - let split = format_ident!("split_{}", vec_ty.rust_name()); - quote! { - #method_sig { - let (a, b) = self.#split(a); - unsafe { - // Note that SSE4.2 only has an intrinsic for saturating cast, - // but not wrapping. 
- let mask = #mask(0xFF); - let lo_masked = _mm_and_si128(a.into(), mask); - let hi_masked = _mm_and_si128(b.into(), mask); - let result = #pack(lo_masked, hi_masked); - result.simd_into(self) + match (vec_ty.scalar, t.n_bits(), vec_ty.n_bits()) { + (ScalarType::Unsigned, 128, 256) => { + let mask = set1_intrinsic(&VecType::new( + vec_ty.scalar, + vec_ty.scalar_bits, + vec_ty.len / 2, + )); + let pack = pack_intrinsic( + vec_ty.scalar_bits, + matches!(vec_ty.scalar, ScalarType::Int), + t.n_bits(), + ); + let split = format_ident!("split_{}", vec_ty.rust_name()); + quote! { + #method_sig { + let (a, b) = self.#split(a); + unsafe { + // Note that SSE4.2 only has an intrinsic for saturating cast, + // but not wrapping. + let mask = #mask(0xFF); + let lo_masked = _mm_and_si128(a.into(), mask); + let hi_masked = _mm_and_si128(b.into(), mask); + let result = #pack(lo_masked, hi_masked); + result.simd_into(self) + } + } } } + (ScalarType::Float, 128, 256) => { + let split = format_ident!("split_{}", vec_ty.rust_name()); + quote! { + #method_sig { + let (a, b) = self.#split(a); + unsafe { + let lo = _mm_cvtpd_ps(a.into()); + let hi = _mm_cvtpd_ps(b.into()); + _mm_movelh_ps(lo, hi).simd_into(self) + } + } + } + } + _ => unimplemented!(), } } _ => unreachable!(), diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 56305658..6e9fc660 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -325,35 +325,67 @@ fn mk_simd_impl(level: Level) -> TokenStream { } } OpSig::WidenNarrow(to_ty) => { + assert_eq!(vec_ty.scalar, to_ty.scalar); + assert_eq!(vec_ty.len, to_ty.len); match method { "widen" => { - assert_eq!(vec_ty.rust_name(), "u8x16"); - assert_eq!(to_ty.rust_name(), "u16x16"); - quote! { - #method_sig { - let low = u16x8_extend_low_u8x16(a.into()); - let high = u16x8_extend_high_u8x16(a.into()); - self.combine_u16x8(low.simd_into(self), high.simd_into(self)) + assert_eq!(vec_ty.scalar_bits * 2, to_ty.scalar_bits); + + match (vec_ty.scalar, vec_ty.len, vec_ty.scalar_bits) { + (ScalarType::Unsigned, 16, 8) => { + quote! { + #method_sig { + let low = u16x8_extend_low_u8x16(a.into()); + let high = u16x8_extend_high_u8x16(a.into()); + self.combine_u16x8(low.simd_into(self), high.simd_into(self)) + } + } + } + (ScalarType::Float, 4, 32) => { + quote! { + #method_sig { + let low = f64x2_promote_low_f32x4(a.into()); + let high = f64x2_promote_low_f32x4(u64x2_shuffle::<1, 1>(a.into(), a.into())); + self.combine_f64x2(low.simd_into(self), high.simd_into(self)) + } + } } + _ => unimplemented!(), } } "narrow" => { - assert_eq!(vec_ty.rust_name(), "u16x16"); - assert_eq!(to_ty.rust_name(), "u8x16"); - // WASM SIMD only has saturating narrowing instructions, so we emulate - // truncated narrowing by masking out the - quote! { - #method_sig { - let mask = u16x8_splat(0xFF); - let (low, high) = self.split_u16x16(a); - let low_masked = v128_and(low.into(), mask); - let high_masked = v128_and(high.into(), mask); - let result = u8x16_narrow_i16x8(low_masked, high_masked); - result.simd_into(self) + assert_eq!(vec_ty.scalar_bits / 2, to_ty.scalar_bits); + + match (vec_ty.scalar, vec_ty.len, vec_ty.scalar_bits) { + (ScalarType::Unsigned, 16, 16) => { + // WASM SIMD only has saturating narrowing instructions, so we emulate + // truncated narrowing by masking out the upper byte of each u16 + quote! 
{ + #method_sig { + let mask = u16x8_splat(0xFF); + let (low, high) = self.split_u16x16(a); + let low_masked = v128_and(low.into(), mask); + let high_masked = v128_and(high.into(), mask); + let result = u8x16_narrow_i16x8(low_masked, high_masked); + result.simd_into(self) + } + } + } + (ScalarType::Float, 4, 64) => { + quote! { + #method_sig { + let (low, high) = self.split_f64x4(a); + let low = f32x4_demote_f64x2_zero(low.into()); + let high = f32x4_demote_f64x2_zero(high.into()); + let result = u64x2_shuffle::<0, 2>(low, high); + result.simd_into(self) + } + } } + _ => unimplemented!(), } } - _ => unimplemented!(), + _ => unreachable!(), } } OpSig::LoadInterleaved(block_size, count) => { diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index bf6b1631..d876ea2e 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -127,8 +127,23 @@ pub fn ops_for_type(ty: &VecType, cvt: bool) -> Vec<(&str, OpSig)> { ops.push(("neg", OpSig::Unary)); } - if ty.scalar == ScalarType::Float { - if cvt { + if let (ScalarType::Unsigned | ScalarType::Float, 8 | 16 | 32, 512) = + (ty.scalar, ty.scalar_bits, ty.n_bits()) + { + ops.push(("load_interleaved_128", OpSig::LoadInterleaved(128, 4))); + ops.push(("store_interleaved_128", OpSig::StoreInterleaved(128, 4))); + } + + if cvt { + if let Some(widened) = ty.widened() { + ops.push(("widen", OpSig::WidenNarrow(widened))); + } + + if let Some(narrowed) = ty.narrowed() { + ops.push(("narrow", OpSig::WidenNarrow(narrowed))); + } + + if ty.scalar == ScalarType::Float { if ty.scalar_bits == 64 { ops.push(("reinterpret_f32", OpSig::Reinterpret(ScalarType::Float, 32))); } else { @@ -138,44 +153,23 @@ pub fn ops_for_type(ty: &VecType, cvt: bool) -> Vec<(&str, OpSig)> { } } - if ty.scalar_bits == 64 { - return ops; - } - } - - if matches!(ty.scalar, ScalarType::Unsigned | ScalarType::Float) && ty.n_bits() == 512 { - ops.push(("load_interleaved_128", OpSig::LoadInterleaved(128, 4))); - } - - if matches!(ty.scalar, ScalarType::Unsigned | ScalarType::Float) && ty.n_bits() == 512 { - ops.push(("store_interleaved_128", OpSig::StoreInterleaved(128, 4))); - } - - if cvt { - if matches!(ty.scalar, ScalarType::Unsigned) { - if let Some(widened) = ty.widened() { - ops.push(("widen", OpSig::WidenNarrow(widened))); + // TODO: provide reinterpret ops for f64; it should be very straightforward + if !(ty.scalar == ScalarType::Float && ty.scalar_bits == 64) { + if valid_reinterpret(ty, ScalarType::Unsigned, 8) { + ops.push(( + "reinterpret_u8", + OpSig::Reinterpret(ScalarType::Unsigned, 8), + )); } - if let Some(narrowed) = ty.narrowed() { - ops.push(("narrow", OpSig::WidenNarrow(narrowed))); + if valid_reinterpret(ty, ScalarType::Unsigned, 32) { + ops.push(( + "reinterpret_u32", + OpSig::Reinterpret(ScalarType::Unsigned, 32), + )); } } - if valid_reinterpret(ty, ScalarType::Unsigned, 8) { - ops.push(( - "reinterpret_u8", - OpSig::Reinterpret(ScalarType::Unsigned, 8), - )); - } - - if valid_reinterpret(ty, ScalarType::Unsigned, 32) { - ops.push(( - "reinterpret_u32", - OpSig::Reinterpret(ScalarType::Unsigned, 32), - )); - } - match (ty.scalar, ty.scalar_bits) { (ScalarType::Float, 32) => { ops.push(("cvt_u32", OpSig::Cvt(ScalarType::Unsigned, 32))); diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs index ed0639a0..de544fa3 100644 --- a/fearless_simd_gen/src/types.rs +++ b/fearless_simd_gen/src/types.rs @@ -74,27 +74,19 @@ impl VecType { } pub fn widened(&self) -> Option { - if matches!(self.scalar, ScalarType::Mask 
| ScalarType::Float) - || self.n_bits() > 256 - || self.scalar_bits != 8 - { - return None; + match (self.scalar, self.n_bits(), self.scalar_bits) { + (ScalarType::Unsigned, 128 | 256, 8) => Some(Self::new(self.scalar, 16, self.len)), + (ScalarType::Float, 128 | 256, 32) => Some(Self::new(self.scalar, 64, self.len)), + _ => None, } - - let scalar_bits = self.scalar_bits * 2; - Some(Self::new(self.scalar, scalar_bits, self.len)) } pub fn narrowed(&self) -> Option { - if matches!(self.scalar, ScalarType::Mask | ScalarType::Float) - || self.n_bits() < 256 - || self.scalar_bits != 16 - { - return None; + match (self.scalar, self.n_bits(), self.scalar_bits) { + (ScalarType::Unsigned, 256 | 512, 16) => Some(Self::new(self.scalar, 8, self.len)), + (ScalarType::Float, 256 | 512, 64) => Some(Self::new(self.scalar, 32, self.len)), + _ => None, } - - let scalar_bits = self.scalar_bits / 2; - Some(Self::new(self.scalar, scalar_bits, self.len)) } pub fn mask_ty(&self) -> Self { diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index c7706348..3b899de3 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -9,6 +9,10 @@ clippy::unseparated_literal_suffix, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] +#![expect( + clippy::approx_constant, + reason = "these constants are good test vectors" +)] //! Tests for `fearless_simd`. @@ -2380,6 +2384,71 @@ fn narrow_u16x32(simd: S) { ); } +#[simd_test] +fn widen_f32x4(simd: S) { + let a = f32x4::from_slice(simd, &[1.0, -2.5, 3.14159, 0.0]); + assert_eq!(simd.widen_f32x4(a).val, [1.0, -2.5, 3.141590118408203, 0.0]); +} + +#[simd_test] +fn widen_f32x8(simd: S) { + let a = f32x8::from_slice( + simd, + &[1.0, -2.5, 3.14159, 0.0, 100.25, -0.5, f32::MAX, f32::MIN], + ); + let result = simd.widen_f32x8(a); + assert_eq!( + result.val, + [ + 1.0, + -2.5, + 3.141590118408203, + 0.0, + 100.25, + -0.5, + f32::MAX as f64, + f32::MIN as f64 + ] + ); +} + +#[simd_test] +fn narrow_f64x4(simd: S) { + let a = f64x4::from_slice(simd, &[1.0, -2.5, 3.14159265358979, 0.0]); + assert_eq!(simd.narrow_f64x4(a).val, [1.0, -2.5, 3.1415927, 0.0]); +} + +#[simd_test] +fn narrow_f64x8(simd: S) { + let a = f64x8::from_slice( + simd, + &[ + 1.0, + -2.5, + 3.14159265358979, + 0.0, + 100.25, + -0.5, + f64::MAX, + f64::MIN, + ], + ); + let result = simd.narrow_f64x8(a); + assert_eq!( + result.val, + [ + 1.0, + -2.5, + 3.1415927, + 0.0, + 100.25, + -0.5, + f32::INFINITY, + f32::NEG_INFINITY + ] + ); +} + #[simd_test] fn abs_f64x2(simd: S) { let a = f64x2::from_slice(simd, &[-1.5, 2.5]);
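The new widen/narrow methods also compose cleanly: widening to f64 and narrowing back is lossless for lane values that are exactly representable in f32. A round-trip check in the style of the harness tests above would make that explicit; this is only a sketch that reuses the `#[simd_test]` macro, the constructors, and the new trait methods shown in this diff (the test name is illustrative, not part of the change):

#[simd_test]
fn widen_narrow_roundtrip_f32x4(simd: S) {
    // All lanes are exactly representable in f32, so widening to f64x4 and
    // narrowing back to f32x4 must return the original values.
    let a = f32x4::from_slice(simd, &[1.0, -2.5, 0.25, 0.0]);
    let widened = simd.widen_f32x4(a);
    let narrowed = simd.narrow_f64x4(widened);
    assert_eq!(narrowed.val, [1.0, -2.5, 0.25, 0.0]);
}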