diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index f09e03d0..005c74ae 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -253,6 +253,10 @@ impl Simd for Avx2 { unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { _mm_fnmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) @@ -1921,6 +1925,10 @@ impl Simd for Avx2 { unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { _mm_fnmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) @@ -2238,6 +2246,10 @@ impl Simd for Avx2 { unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + unsafe { _mm256_fnmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { unsafe { _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) @@ -4238,6 +4250,10 @@ impl Simd for Avx2 { unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + unsafe { _mm256_fnmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { unsafe { _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) @@ -4627,6 +4643,16 @@ impl Simd for Avx2 { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -6988,6 +7014,16 @@ impl Simd for Avx2 { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index c2f39a1b..4db585fd 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -397,6 +397,10 @@ impl Simd for Fallback { a.mul(b).sub(c) } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + c.sub(a.mul(b)) + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { [ f32::floor(a[0usize]), @@ -3772,6 +3776,10 @@ impl Simd for Fallback { a.mul(b).sub(c) } 
#[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + c.sub(a.mul(b)) + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { [f64::floor(a[0usize]), f64::floor(a[1usize])].simd_into(self) } @@ -4141,6 +4149,16 @@ impl Simd for Fallback { ) } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_neg_add_f32x4(a0, b0, c0), + self.mul_neg_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) @@ -6185,6 +6203,16 @@ impl Simd for Fallback { ) } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_neg_add_f64x2(a0, b0, c0), + self.mul_neg_add_f64x2(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) @@ -6577,6 +6605,16 @@ impl Simd for Fallback { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -8743,6 +8781,16 @@ impl Simd for Fallback { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index def528dd..d96fc8c2 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -245,6 +245,10 @@ impl Simd for Neon { unsafe { vnegq_f32(vfmsq_f32(c.into(), b.into(), a.into())).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { vfmsq_f32(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { unsafe { vrndmq_f32(a.into()).simd_into(self) } } @@ -1801,6 +1805,10 @@ impl Simd for Neon { unsafe { vnegq_f64(vfmsq_f64(c.into(), b.into(), a.into())).simd_into(self) } } #[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { vfmsq_f64(c.into(), b.into(), a.into()).simd_into(self) } + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { vrndmq_f64(a.into()).simd_into(self) } } @@ -2146,6 +2154,16 @@ impl Simd for Neon { ) } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) 
= self.split_f32x8(c); + self.combine_f32x4( + self.mul_neg_add_f32x4(a0, b0, c0), + self.mul_neg_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) @@ -4247,6 +4265,16 @@ impl Simd for Neon { ) } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_neg_add_f64x2(a0, b0, c0), + self.mul_neg_add_f64x2(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) @@ -4653,6 +4681,16 @@ impl Simd for Neon { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -6708,6 +6746,16 @@ impl Simd for Neon { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index b0732adf..2b4c13cd 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -184,6 +184,8 @@ pub trait Simd: fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f32x4(self, a: f32x4) -> f32x4; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -841,6 +843,8 @@ pub trait Simd: fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f64x2(self, a: 
f64x2, b: f64x2, c: f64x2) -> f64x2; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f64x2(self, a: f64x2) -> f64x2; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -966,6 +970,8 @@ pub trait Simd: fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f32x8(self, a: f32x8) -> f32x8; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -1645,6 +1651,8 @@ pub trait Simd: fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f64x4(self, a: f64x4) -> f64x4; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -1774,6 +1782,8 @@ pub trait Simd: fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + 
fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f32x16(self, a: f32x16) -> f32x16; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -2447,6 +2457,8 @@ pub trait Simd: fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8; #[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8; + #[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor_f64x8(self, a: f64x8) -> f64x8; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] @@ -2688,6 +2700,8 @@ pub trait SimdFloat: fn mul_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self; #[doc = "Compute `(self * op1) - op2` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."] fn mul_sub(self, op1: impl SimdInto, op2: impl SimdInto) -> Self; + #[doc = "Compute `op2 - (self * op1)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self; #[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."] fn floor(self) -> Self; #[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."] diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index 6768951e..0cd8042a 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -193,6 +193,11 @@ impl crate::SimdFloat for f32x4 { .mul_sub_f32x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f32x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f32x4(self) } @@ -1884,6 +1889,11 @@ impl crate::SimdFloat for f64x2 { .mul_sub_f64x2(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f64x2(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f64x2(self) } @@ -2247,6 +2257,11 @@ impl 
crate::SimdFloat for f32x8 { .mul_sub_f32x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f32x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f32x8(self) } @@ -4018,6 +4033,11 @@ impl crate::SimdFloat for f64x4 { .mul_sub_f64x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f64x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f64x4(self) } @@ -4401,6 +4421,11 @@ impl crate::SimdFloat for f32x16 { .mul_sub_f32x16(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f32x16(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f32x16(self) } @@ -6122,6 +6147,11 @@ impl crate::SimdFloat for f64x8 { .mul_sub_f64x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] + fn mul_neg_add(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.simd + .mul_neg_add_f64x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) + } + #[inline(always)] fn floor(self) -> Self { self.simd.floor_f64x8(self) } diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index a9f7c1ad..1d99bb2e 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -258,6 +258,10 @@ impl Simd for Sse4_2 { a * b - c } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + c - a * b + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) @@ -1961,6 +1965,10 @@ impl Simd for Sse4_2 { a * b - c } #[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + c - a * b + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) @@ -2324,6 +2332,16 @@ impl Simd for Sse4_2 { ) } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_neg_add_f32x4(a0, b0, c0), + self.mul_neg_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) @@ -4537,6 +4555,16 @@ impl Simd for Sse4_2 { ) } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_neg_add_f64x2(a0, b0, c0), + self.mul_neg_add_f64x2(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) @@ -4971,6 +4999,16 @@ impl Simd for Sse4_2 { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: 
f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -7384,6 +7422,16 @@ impl Simd for Sse4_2 { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 59c94768..88c470a3 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -264,6 +264,17 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn mul_neg_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + #[cfg(target_feature = "relaxed-simd")] + { + f32x4_relaxed_nmadd(a.into(), b.into(), c.into()).simd_into(self) + } + #[cfg(not(target_feature = "relaxed-simd"))] + { + self.sub_f32x4(c, self.mul_f32x4(a, b)) + } + } + #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { f32x4_floor(a.into()).simd_into(self) } @@ -1906,6 +1917,17 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn mul_neg_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + #[cfg(target_feature = "relaxed-simd")] + { + f64x2_relaxed_nmadd(a.into(), b.into(), c.into()).simd_into(self) + } + #[cfg(not(target_feature = "relaxed-simd"))] + { + self.sub_f64x2(c, self.mul_f64x2(a, b)) + } + } + #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { f64x2_floor(a.into()).simd_into(self) } @@ -2274,6 +2296,16 @@ impl Simd for WasmSimd128 { ) } #[inline(always)] + fn mul_neg_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_neg_add_f32x4(a0, b0, c0), + self.mul_neg_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) @@ -4485,6 +4517,16 @@ impl Simd for WasmSimd128 { ) } #[inline(always)] + fn mul_neg_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_neg_add_f64x2(a0, b0, c0), + self.mul_neg_add_f64x2(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) @@ -4919,6 +4961,16 @@ impl Simd for WasmSimd128 { ) } #[inline(always)] + fn mul_neg_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_neg_add_f32x8(a0, b0, c0), + self.mul_neg_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let 
(a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) @@ -7324,6 +7376,16 @@ impl Simd for WasmSimd128 { ) } #[inline(always)] + fn mul_neg_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_neg_add_f64x4(a0, b0, c0), + self.mul_neg_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) diff --git a/fearless_simd_gen/src/arch/neon.rs b/fearless_simd_gen/src/arch/neon.rs index 081a3242..713576ca 100644 --- a/fearless_simd_gen/src/arch/neon.rs +++ b/fearless_simd_gen/src/arch/neon.rs @@ -37,6 +37,7 @@ fn translate_op(op: &str) -> Option<&'static str> { "min_precise" => "vminnm", "mul_add" => "vfma", "mul_sub" => "vfms", + "mul_neg_add" => "vfms", _ => return None, }) } diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index c6c95533..b5106c41 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -241,6 +241,12 @@ impl Level for Fallback { a.mul(b).sub(c) } } + } else if method == "mul_neg_add" { + quote! { + #method_sig { + c.sub(a.mul(b)) + } + } } else { let args = [ quote! { a.into() }, diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 366359e1..274707b1 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -271,7 +271,7 @@ impl Level for Neon { } OpSig::Ternary => { let args = match method { - "mul_add" | "mul_sub" => [ + "mul_add" | "mul_sub" | "mul_neg_add" => [ quote! { c.into() }, quote! { b.into() }, quote! { a.into() }, @@ -289,6 +289,7 @@ impl Level for Neon { let neg = simple_intrinsic("vneg", vec_ty); expr = quote! { #neg(#expr) }; } + // mul_neg_add computes c - (a * b), which is exactly what vfms does quote! { #method_sig { unsafe { diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index d098d765..693342f8 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -192,30 +192,53 @@ impl Level for WasmSimd128 { } } OpSig::Ternary => { - if matches!(method, "mul_add" | "mul_sub") { - let add_sub = - generic_op_name(if method == "mul_add" { "add" } else { "sub" }, vec_ty); + if matches!(method, "mul_add" | "mul_sub" | "mul_neg_add") { + let fallback_op = if method == "mul_neg_add" { + "sub" + } else if method == "mul_add" { + "add" + } else { + "sub" + }; + let add_sub = generic_op_name(fallback_op, vec_ty); let mul = generic_op_name("mul", vec_ty); - let c = if method == "mul_sub" { + let (relaxed_intrinsic, c_expr, fallback_expr) = if method == "mul_add" { + let relaxed_madd = simple_intrinsic("relaxed_madd", vec_ty); + ( + relaxed_madd, + quote! { c.into() }, + quote! { self.#add_sub(self.#mul(a, b), c) }, + ) + } else if method == "mul_sub" { // WebAssembly just... forgot fused multiply-subtract? It seems the // initial proposal // (https://github.com/WebAssembly/relaxed-simd/issues/27) confused it // with negate multiply-add, and nobody ever resolved the confusion. let negate = simple_intrinsic("neg", vec_ty); - quote! { #negate(c.into()) } + let relaxed_madd = simple_intrinsic("relaxed_madd", vec_ty); + ( + relaxed_madd, + quote! { #negate(c.into()) }, + quote! { self.#add_sub(self.#mul(a, b), c) }, + ) } else { - quote! 
{ c.into() } + // mul_neg_add: c - (a * b) + let relaxed_nmadd = simple_intrinsic("relaxed_nmadd", vec_ty); + ( + relaxed_nmadd, + quote! { c.into() }, + quote! { self.#add_sub(c, self.#mul(a, b)) }, + ) }; - let relaxed_madd = simple_intrinsic("relaxed_madd", vec_ty); quote! { #method_sig { #[cfg(target_feature = "relaxed-simd")] - { #relaxed_madd(a.into(), b.into(), #c).simd_into(self) } + { #relaxed_intrinsic(a.into(), b.into(), #c_expr).simd_into(self) } #[cfg(not(target_feature = "relaxed-simd"))] - { self.#add_sub(self.#mul(a, b), c) } + { #fallback_expr } } } } else { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 90700218..45b9122c 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -622,6 +622,14 @@ impl X86 { } } } + "mul_neg_add" if *self == Self::Avx2 => { + let intrinsic = simple_intrinsic("fnmadd", vec_ty); + quote! { + #method_sig { + unsafe { #intrinsic(a.into(), b.into(), c.into()).simd_into(self) } + } + } + } "mul_add" => { quote! { #method_sig { @@ -636,6 +644,13 @@ impl X86 { } } } + "mul_neg_add" => { + quote! { + #method_sig { + c - a * b + } + } + } _ => { let args = [ quote! { a.into() }, diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index a7b50452..273aea36 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -631,6 +631,13 @@ const FLOAT_OPS: &[Op] = &[ "Compute `({arg0} * {arg1}) - {arg2}` (fused multiply-subtract) for each element.\n\n\ Depending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors.", ), + Op::new( + "mul_neg_add", + OpKind::VecTraitMethod, + OpSig::Ternary, + "Compute `{arg2} - ({arg0} * {arg1})` (fused negated multiply-add) for each element.\n\n\ + Depending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors.", + ), Op::new( "floor", OpKind::VecTraitMethod, diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 3b618e87..1e4d07f0 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -145,6 +145,14 @@ fn msub_f32x4(simd: S) { assert_eq!(*a.mul_sub(b, c), [19.0, 28.0, 37.0, 46.0]); } +#[simd_test] +fn mul_neg_add_f32x4(simd: S) { + let a = f32x4::from_slice(simd, &[2.0, 3.0, 4.0, 5.0]); + let b = f32x4::from_slice(simd, &[10.0, 10.0, 10.0, 10.0]); + let c = f32x4::from_slice(simd, &[100.0, 50.0, 25.0, 10.0]); + assert_eq!(*a.mul_neg_add(b, c), [80.0, 20.0, -15.0, -40.0]); +} + #[simd_test] fn max_precise_f32x4_with_nan(simd: S) { let a = f32x4::from_slice(simd, &[f32::NAN, -3.0, f32::INFINITY, 0.5]); @@ -2670,6 +2678,14 @@ fn madd_f64x2(simd: S) { assert_eq!(*a.mul_add(b, c), [6.0, 13.0]); } +#[simd_test] +fn mul_neg_add_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[2.0, 3.0]); + let b = f64x2::from_slice(simd, &[4.0, 5.0]); + let c = f64x2::from_slice(simd, &[20.0, 30.0]); + assert_eq!(*a.mul_neg_add(b, c), [12.0, 15.0]); +} + #[simd_test] fn floor_f64x2(simd: S) { let a = f64x2::from_slice(simd, &[1.7, -2.3]);
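
Semantics note (not part of the diff): every backend above computes `c - (a * b)`; only the rounding behavior differs between the fused and unfused paths. A scalar reference model using std's fused multiply-add, included here purely as an illustration of why the operand orders in the generated code are what they are:

```rust
/// Scalar model of `mul_neg_add` (illustrative only, not part of this change).
/// All lanes of every vector width compute `c - (a * b)`:
///   - x86 FNMADD (`_mm*_fnmadd_p*`): fused `-(a * b) + c`, one rounding.
///   - NEON FMLS (`vfmsq_*`): fused `acc - (x * y)`; the accumulator is the
///     first operand, which is why the generated code passes `(c, b, a)`.
///   - SSE4.2 / plain wasm32 SIMD fallback: unfused `c - a * b`, two roundings.
fn mul_neg_add_scalar(a: f32, b: f32, c: f32) -> f32 {
    (-a).mul_add(b, c) // std's scalar FMA: (-a) * b + c with a single rounding
}
```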
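If broader coverage is wanted, a check against exactly representable inputs could sit alongside the existing harness tests; a minimal sketch in the same style (hypothetical test, not part of this diff), with values chosen so the fused and unfused paths agree bit-for-bit:

```rust
#[simd_test]
fn mul_neg_add_f32x4_matches_scalar(simd: S) {
    // a * b = [4.5, -14.0, -1.0, 4.0] and c - a * b are all exactly
    // representable in f32, so one-rounding and two-rounding backends
    // must produce identical results.
    let a = f32x4::from_slice(simd, &[1.5, -2.0, 0.25, 8.0]);
    let b = f32x4::from_slice(simd, &[3.0, 7.0, -4.0, 0.5]);
    let c = f32x4::from_slice(simd, &[10.0, 10.0, 10.0, 10.0]);
    assert_eq!(*a.mul_neg_add(b, c), [5.5, 24.0, 11.0, 6.0]);
}
```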