diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index f09e03d0..005c74ae 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -253,6 +253,10 @@ impl Simd for Avx2 { unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { _mm_fnmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) @@ -1921,6 +1925,10 @@ impl Simd for Avx2 { unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { _mm_fnmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) @@ -2238,6 +2246,10 @@ impl Simd for Avx2 { unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + unsafe { _mm256_fnmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { unsafe { _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) @@ -4238,6 +4250,10 @@ impl Simd for Avx2 { unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + unsafe { _mm256_fnmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { unsafe { _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) @@ -4627,6 +4643,16 @@ impl Simd for Avx2 { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -6988,6 +7014,16 @@ impl Simd for Avx2 { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index c2f39a1b..4db585fd 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -397,6 +397,10 @@ impl Simd for Fallback { a.mul(b).sub(c) } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + c.sub(a.mul(b)) + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { [ f32::floor(a[0usize]), @@ -3772,6 +3776,10 @@ impl Simd for Fallback { a.mul(b).sub(c) } 
#[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + c.sub(a.mul(b)) + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { [f64::floor(a[0usize]), f64::floor(a[1usize])].simd_into(self) } @@ -4141,6 +4149,16 @@ impl Simd for Fallback { ) } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_neg_add_f32x4(a0, b0, c0), + self.mul_neg_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) @@ -6185,6 +6203,16 @@ impl Simd for Fallback { ) } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_neg_add_f64x2(a0, b0, c0), + self.mul_neg_add_f64x2(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) @@ -6577,6 +6605,16 @@ impl Simd for Fallback { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -8743,6 +8781,16 @@ impl Simd for Fallback { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index def528dd..d96fc8c2 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -245,6 +245,10 @@ impl Simd for Neon { unsafe { vnegq_f32(vfmsq_f32(c.into(), b.into(), a.into())).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { vfmsq_f32(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { unsafe { vrndmq_f32(a.into()).simd_into(self) } } @@ -1801,6 +1805,10 @@ impl Simd for Neon { unsafe { vnegq_f64(vfmsq_f64(c.into(), b.into(), a.into())).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { vfmsq_f64(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { vrndmq_f64(a.into()).simd_into(self) } } @@ -2146,6 +2154,16 @@ impl Simd for Neon { ) } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) 
= self.split_f32x8(c); + self.combine_f32x4( + self.mul_neg_add_f32x4(a0, b0, c0), + self.mul_neg_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) @@ -4247,6 +4265,16 @@ impl Simd for Neon { ) } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_neg_add_f64x2(a0, b0, c0), + self.mul_neg_add_f64x2(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) @@ -4653,6 +4681,16 @@ impl Simd for Neon { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -6708,6 +6746,16 @@ impl Simd for Neon { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index b0732adf..2b4c13cd 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -184,6 +184,8 @@ pub trait Simd: fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f32x4(self, a: f32x4) -> f32x4; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -841,6 +843,8 @@ pub trait Simd: fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f64x2(self, a: 
f64x2, b: f64x2, c: f64x2) -> f64x2; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f64x2(self, a: f64x2) -> f64x2; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -966,6 +970,8 @@ pub trait Simd: fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f32x8(self, a: f32x8) -> f32x8; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -1645,6 +1651,8 @@ pub trait Simd: fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f64x4(self, a: f64x4) -> f64x4; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -1774,6 +1782,8 @@ pub trait Simd: fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + 
fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f32x16(self, a: f32x16) -> f32x16; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -2447,6 +2457,8 @@ pub trait Simd: fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f64x8(self, a: f64x8) -> f64x8; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -2688,6 +2700,8 @@ pub trait SimdFloat: fn mul_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self; #[doc = "Compute `(self * op1) - op2` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub(self, op1: impl SimdInto, op2: impl SimdInto) -> Self; + #[doc = "Compute `op2 - (self * op1)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor(self) -> Self; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index 6768951e..0cd8042a 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -193,6 +193,11 @@ impl crate::SimdFloat for f32x4 { .mul_sub_f32x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f32x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f32x4(self) } @@ -1884,6 +1889,11 @@ impl crate::SimdFloat for f64x2 { .mul_sub_f64x2(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f64x2(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f64x2(self) } @@ -2247,6 +2257,11 @@ impl 
crate::SimdFloat for f32x8 { .mul_sub_f32x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f32x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f32x8(self) } @@ -4018,6 +4033,11 @@ impl crate::SimdFloat for f64x4 { .mul_sub_f64x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f64x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f64x4(self) } @@ -4401,6 +4421,11 @@ impl crate::SimdFloat for f32x16 { .mul_sub_f32x16(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f32x16(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f32x16(self) } @@ -6122,6 +6147,11 @@ impl crate::SimdFloat for f64x8 { .mul_sub_f64x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f64x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f64x8(self) } diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index a9f7c1ad..1d99bb2e 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -258,6 +258,10 @@ impl Simd for Sse4_2 { a * b - c } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + c - a * b + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) @@ -1961,6 +1965,10 @@ impl Simd for Sse4_2 { a * b - c } #[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + c - a * b + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) @@ -2324,6 +2332,16 @@ impl Simd for Sse4_2 { ) } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_neg_add_f32x4(a0, b0, c0), + self.mul_neg_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) @@ -4537,6 +4555,16 @@ impl Simd for Sse4_2 { ) } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_neg_add_f64x2(a0, b0, c0), + self.mul_neg_add_f64x2(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) @@ -4971,6 +4999,16 @@ impl Simd for Sse4_2 { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: 
f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -7384,6 +7422,16 @@ impl Simd for Sse4_2 { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 59c94768..88c470a3 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -264,6 +264,17 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + #[cfg(target_feature = "relaxed-simd")] + { + f32x4_relaxed_nmadd(a.into(), b.into(), c.into()).simd_into(self) + } + #[cfg(not(target_feature = "relaxed-simd"))] + { + self.sub_f32x4(c, self.mul_f32x4(a, b)) + } + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { f32x4_floor(a.into()).simd_into(self) } @@ -1906,6 +1917,17 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + #[cfg(target_feature = "relaxed-simd")] + { + f64x2_relaxed_nmadd(a.into(), b.into(), c.into()).simd_into(self) + } + #[cfg(not(target_feature = "relaxed-simd"))] + { + self.sub_f64x2(c, self.mul_f64x2(a, b)) + } + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { f64x2_floor(a.into()).simd_into(self) } @@ -2274,6 +2296,16 @@ impl Simd for WasmSimd128 { ) } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_neg_add_f32x4(a0, b0, c0), + self.mul_neg_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) @@ -4485,6 +4517,16 @@ impl Simd for WasmSimd128 { ) } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_neg_add_f64x2(a0, b0, c0), + self.mul_neg_add_f64x2(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) @@ -4919,6 +4961,16 @@ impl Simd for WasmSimd128 { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let 
(a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -7324,6 +7376,16 @@ impl Simd for WasmSimd128 { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd_gen/src/arch/neon.rs b/fearless_simd_gen/src/arch/neon.rs index 081a3242..713576ca 100644 --- a/fearless_simd_gen/src/arch/neon.rs +++ b/fearless_simd_gen/src/arch/neon.rs @@ -37,6 +37,7 @@ fn translate_op(op: &str) -> Option<&'static str> { "min_precise" => "vminnm", "mul_add" => "vfma", "mul_sub" => "vfms", + "mul_neg_add" => "vfms", _ => return None, }) } diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index c6c95533..b5106c41 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -241,6 +241,12 @@ impl Level for Fallback { a.mul(b).sub(c) } } + } else if method == "mul_neg_add" { + quote! { + #method_sig { + c.sub(a.mul(b)) + } + } } else { let args = [ quote! { a.into() }, diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 366359e1..274707b1 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -271,7 +271,7 @@ impl Level for Neon { } OpSig::Ternary => { let args = match method { - "mul_add" | "mul_sub" => [ + "mul_add" | "mul_sub" | "mul_neg_add" => [ quote! { c.into() }, quote! { b.into() }, quote! { a.into() }, @@ -289,6 +289,7 @@ impl Level for Neon { let neg = simple_intrinsic("vneg", vec_ty); expr = quote! { #neg(#expr) }; } + // mul_neg_add computes c - (a * b), which is exactly what vfms does quote! { #method_sig { unsafe { diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index d098d765..693342f8 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -192,30 +192,53 @@ impl Level for WasmSimd128 { } } OpSig::Ternary => { - if matches!(method, "mul_add" | "mul_sub") { - let add_sub = - generic_op_name(if method == "mul_add" { "add" } else { "sub" }, vec_ty); + if matches!(method, "mul_add" | "mul_sub" | "mul_neg_add") { + let fallback_op = if method == "mul_neg_add" { + "sub" + } else if method == "mul_add" { + "add" + } else { + "sub" + }; + let add_sub = generic_op_name(fallback_op, vec_ty); let mul = generic_op_name("mul", vec_ty); - let c = if method == "mul_sub" { + let (relaxed_intrinsic, c_expr, fallback_expr) = if method == "mul_add" { + let relaxed_madd = simple_intrinsic("relaxed_madd", vec_ty); + ( + relaxed_madd, + quote! { c.into() }, + quote! { self.#add_sub(self.#mul(a, b), c) }, + ) + } else if method == "mul_sub" { // WebAssembly just... forgot fused multiply-subtract? It seems the // initial proposal // (https://github.com/WebAssembly/relaxed-simd/issues/27) confused it // with negate multiply-add, and nobody ever resolved the confusion. let negate = simple_intrinsic("neg", vec_ty); - quote! { #negate(c.into()) } + let relaxed_madd = simple_intrinsic("relaxed_madd", vec_ty); + ( + relaxed_madd, + quote! { #negate(c.into()) }, + quote! { self.#add_sub(self.#mul(a, b), c) }, + ) } else { - quote! 
{ c.into() } + // mul_neg_add: c - (a * b) + let relaxed_nmadd = simple_intrinsic("relaxed_nmadd", vec_ty); + ( + relaxed_nmadd, + quote! { c.into() }, + quote! { self.#add_sub(c, self.#mul(a, b)) }, + ) }; - let relaxed_madd = simple_intrinsic("relaxed_madd", vec_ty); quote! { #method_sig { #[cfg(target_feature = "relaxed-simd")] - { #relaxed_madd(a.into(), b.into(), #c).simd_into(self) } + { #relaxed_intrinsic(a.into(), b.into(), #c_expr).simd_into(self) } #[cfg(not(target_feature = "relaxed-simd"))] - { self.#add_sub(self.#mul(a, b), c) } + { #fallback_expr } } } } else { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 90700218..45b9122c 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -622,6 +622,14 @@ impl X86 { } } } + "mul_neg_add" if *self == Self::Avx2 => { + let intrinsic = simple_intrinsic("fnmadd", vec_ty); + quote! { + #method_sig { + unsafe { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) } + } + } + } "mul_add" => { quote! { #method_sig { @@ -636,6 +644,13 @@ impl X86 { } } } + "mul_neg_add" => { + quote! { + #method_sig { + c - a * b + } + } + } _ => { let args = [ quote! { a.into() }, diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index a7b50452..273aea36 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -631,6 +631,13 @@ const FLOAT_OPS: &[Op] = &[ "Compute `({arg0} * {arg1}) - {arg2}` (fused multiply-subtract) for each element.\n\n\ Depending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors.", ), + Op::new( + "mul_neg_add", + OpKind::VecTraitMethod, + OpSig::Ternary, + "Compute `{arg2} - ({arg0} * {arg1})` (fused negated multiply-add) for each element.\n\n\ + Depending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors.", + ), Op::new( "floor", OpKind::VecTraitMethod, diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 3b618e87..1e4d07f0 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -145,6 +145,14 @@ fn msub_f32x4(simd: S) { assert_eq!(*a.mul_sub(b, c), [19.0, 28.0, 37.0, 46.0]); } +#[simd_test] +fn mul_neg_add_f32x4(simd: S) { + let a = f32x4::from_slice(simd, &[2.0, 3.0, 4.0, 5.0]); + let b = f32x4::from_slice(simd, &[10.0, 10.0, 10.0, 10.0]); + let c = f32x4::from_slice(simd, &[100.0, 50.0, 25.0, 10.0]); + assert_eq!(*a.mul_neg_add(b, c), [80.0, 20.0, -15.0, -40.0]); +} + #[simd_test] fn max_precise_f32x4_with_nan(simd: S) { let a = f32x4::from_slice(simd, &[f32::NAN, -3.0, f32::INFINITY, 0.5]); @@ -2670,6 +2678,14 @@ fn madd_f64x2(simd: S) { assert_eq!(*a.mul_add(b, c), [6.0, 13.0]); } +#[simd_test] +fn mul_neg_add_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[2.0, 3.0]); + let b = f64x2::from_slice(simd, &[4.0, 5.0]); + let c = f64x2::from_slice(simd, &[20.0, 30.0]); + assert_eq!(*a.mul_neg_add(b, c), [12.0, 15.0]); +} + #[simd_test] fn floor_f64x2(simd: S) { let a = f64x2::from_slice(simd, &[1.7, -2.3]);
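
Semantics note (not part of the diff): every backend above computes `c - (a * b)`; only the rounding behavior differs between the fused and unfused paths. A scalar reference model using std's fused multiply-add, included here purely as an illustration of why the operand orders in the generated code are what they are:

```rust
/// Scalar model of `mul_neg_add` (illustrative only, not part of this change).
/// All lanes of every vector width compute `c - (a * b)`:
///   - x86 FNMADD (`_mm*_fnmadd_p*`): fused `-(a * b) + c`, one rounding.
///   - NEON FMLS (`vfmsq_*`): fused `acc - (x * y)`; the accumulator is the
///     first operand, which is why the generated code passes `(c, b, a)`.
///   - SSE4.2 / plain wasm32 SIMD fallback: unfused `c - a * b`, two roundings.
fn mul_neg_add_scalar(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, c) // std's scalar FMA: (-a) * b + c with a single rounding
}
```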
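If broader coverage is wanted, a check against exactly representable inputs could sit alongside the existing harness tests; a minimal sketch in the same style (hypothetical test, not part of this diff), with values chosen so the fused and unfused paths agree bit-for-bit:

```rust
#[simd_test]
fn mul_neg_add_f32x4_matches_scalar(simd: S) {
    // a * b = [4.5, -14.0, -1.0, 4.0] and c - a * b are all exactly
    // representable in f32, so one-rounding and two-rounding backends
    // must produce identical results.
    let a = f32x4::from_slice(simd, &[1.5, -2.0, 0.25, 8.0]);
    let b = f32x4::from_slice(simd, &[3.0, 7.0, -4.0, 0.5]);
    let c = f32x4::from_slice(simd, &[10.0, 10.0, 10.0, 10.0]);
    assert_eq!(*a.mul_neg_add(b, c), [5.5, 24.0, 11.0, 6.0]);
}
```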