libm: Implement exp and its variants for i586 with inline assembly

quaternic · web-flow · commit 30342079136d · 2025-12-06T08:00:15.000Z
Resolve the severe imprecision (~2%) that is due to inconsistent rounding. Closes: #1021
diff --git a/libm-test/src/precision.rs b/libm-test/src/precision.rs
@@ -83,6 +83,19 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
         Bn::Tgamma => 20,
     };
 
+    // These have a separate implementation on i586
+    if cfg!(x86_no_sse) {
+        match ctx.fn_ident {
+            Id::Exp => ulp = 1,
+            Id::Exp2 => ulp = 1,
+            Id::Exp10 => ulp = 1,
+            Id::Expf => ulp = 0,
+            Id::Exp2f => ulp = 0,
+            Id::Exp10f => ulp = 0,
+            _ => (),
+        }
+    }
+
     // There are some cases where musl's approximation is less accurate than ours. For these
     // cases, increase the ULP.
     if ctx.basis == Musl {
@@ -98,6 +111,8 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
             Id::Cbrt => ulp = 2,
             // FIXME(#401): musl has an incorrect result here.
             Id::Fdim => ulp = 2,
+            Id::Exp2f => ulp = 1,
+            Id::Expf => ulp = 1,
             Id::Sincosf => ulp = 500,
             Id::Tgamma => ulp = 20,
             _ => (),
@@ -124,8 +139,6 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
             Id::Asinh => ulp = 3,
             Id::Asinhf => ulp = 3,
             Id::Cbrt => ulp = 1,
-            Id::Exp10 | Id::Exp10f => ulp = 1_000_000,
-            Id::Exp2 | Id::Exp2f => ulp = 10_000_000,
             Id::Log1p | Id::Log1pf => ulp = 2,
             Id::Tan => ulp = 2,
             _ => (),
@@ -218,15 +231,6 @@ impl MaybeOverride<(f32,)> for SpecialCase {
             return XFAIL("expm1 representable numbers");
         }
 
-        if cfg!(x86_no_sse)
-            && ctx.base_name == BaseName::Exp2
-            && !expected.is_infinite()
-            && actual.is_infinite()
-        {
-            // We return infinity when there is a representable value. Test input: 127.97238
-            return XFAIL("586 exp2 representable numbers");
-        }
-
         if ctx.base_name == BaseName::Sinh && input.0.abs() > 80.0 && actual.is_nan() {
             // we return some NaN that should be real values or infinite
             if ctx.basis == CheckBasis::Musl {
@@ -278,14 +282,6 @@ impl MaybeOverride<(f64,)> for SpecialCase {
             return XFAIL("i586 rint rounding mode");
         }
 
-        if cfg!(x86_no_sse)
-            && (ctx.fn_ident == Identifier::Exp10 || ctx.fn_ident == Identifier::Exp2)
-        {
-            // FIXME: i586 has very imprecise results with ULP > u32::MAX for these
-            // operations so we can't reasonably provide a limit.
-            return XFAIL_NOCHECK;
-        }
-
         if ctx.base_name == BaseName::J0 && input.0 < -1e300 {
             // Errors get huge close to -inf
             return XFAIL_NOCHECK;
diff --git a/libm/src/math/arch/i586.rs b/libm/src/math/arch/i586.rs
@@ -60,3 +60,62 @@ pub fn floor(mut x: f64) -> f64 {
     }
     x
 }
+/// Implements the exponential functions with `x87` assembly.
+///
+/// This relies on the instruction `f2xm1`, which computes `2^x - 1` (for
+/// |x| < 1). This transcendental instruction is documented to produce results
+/// with error below 1ulp (in the native double-extended precision format). This
+/// translates to correctly rounded results for f32, but results in f64 may have
+/// 1ulp error, which may depend on the hardware.
+macro_rules! x87exp {
+    ($float_ty:ident, $word_size:literal, $fn_name:ident,  $load_op:literal) => {
+        pub fn $fn_name(mut x: $float_ty) -> $float_ty { unsafe {
+            core::arch::asm!(
+                // Prepare the register stack as
+                // ```
+                // st(0) = y = x*log2(base)
+                // st(1) = 1.0
+                // st(2) = round(y)
+                // ```
+                concat!($load_op, " ", $word_size, " ptr [{x}]"),
+                "fld1",
+                "fld st(1)",
+                "frndint",
+                "fxch st(2)",
+
+                // Compare y with round(y) to determine if y is finite and
+                // not an integer. If so, compute `exp2(y - round(y))` into
+                // st(1). Otherwise skip ahead with `st(1) = 1.0`
+                "fucom st(2)",
+                "fstsw ax",
+                "test ax, 0x4000",
+                "jnz 2f",
+                "fsub st(0), st(2)", // st(0) = y - round(y)
+                "f2xm1",             // st(0) = 2^st(0) - 1.0
+                "fadd st(1), st(0)", // st(1) = 1 + st(0) = exp2(y - round(y))
+                "2:",
+
+                // Finally, scale by `exp2(round(y))` and clear the stack.
+                "fstp st(0)",
+                "fscale",
+                concat!("fstp ", $word_size, " ptr [{x}]"),
+                "fstp st(0)",
+                x = in(reg) &mut x,
+                out("ax") _,
+                out("st(0)") _, out("st(1)") _,
+                out("st(2)") _, out("st(3)") _,
+                out("st(4)") _, out("st(5)") _,
+                out("st(6)") _, out("st(7)") _,
+                options(nostack),
+            );
+            x
+        }}
+    };
+}
+
+x87exp!(f32, "dword", x87_exp2f, "fld");
+x87exp!(f64, "qword", x87_exp2, "fld");
+x87exp!(f32, "dword", x87_exp10f, "fldl2t\nfmul");
+x87exp!(f64, "qword", x87_exp10, "fldl2t\nfmul");
+x87exp!(f32, "dword", x87_expf, "fldl2e\nfmul");
+x87exp!(f64, "qword", x87_exp, "fldl2e\nfmul");
diff --git a/libm/src/math/arch/mod.rs b/libm/src/math/arch/mod.rs
@@ -48,3 +48,8 @@ cfg_if! {
         pub use i586::{ceil, floor};
     }
 }
+cfg_if! {
+    if #[cfg(x86_no_sse)] {
+        pub use i586::{x87_exp10f, x87_exp10, x87_expf, x87_exp, x87_exp2f, x87_exp2};
+    }
+}
diff --git a/libm/src/math/exp.rs b/libm/src/math/exp.rs
@@ -83,6 +83,12 @@ const P5: f64 = 4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */
 /// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp(mut x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let x1p1023 = f64::from_bits(0x7fe0000000000000); // 0x1p1023 === 2 ^ 1023
     let x1p_149 = f64::from_bits(0x36a0000000000000); // 0x1p-149 === 2 ^ -149
 
diff --git a/libm/src/math/exp10.rs b/libm/src/math/exp10.rs
@@ -9,6 +9,12 @@ const P10: &[f64] = &[
 /// Calculates 10 raised to the power of `x` (f64).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp10(x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp10,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let (mut y, n) = modf(x);
     let u: u64 = n.to_bits();
     /* fabs(n) < 16 without raising invalid on nan */
diff --git a/libm/src/math/exp10f.rs b/libm/src/math/exp10f.rs
@@ -9,6 +9,12 @@ const P10: &[f32] = &[
 /// Calculates 10 raised to the power of `x` (f32).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp10f(x: f32) -> f32 {
+    select_implementation! {
+        name: x87_exp10f,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let (mut y, n) = modff(x);
     let u = n.to_bits();
     /* fabsf(n) < 8 without raising invalid on nan */
diff --git a/libm/src/math/exp2.rs b/libm/src/math/exp2.rs
@@ -324,6 +324,12 @@ static TBL: [u64; TBLSIZE * 2] = [
 /// Calculate `2^x`, that is, 2 raised to the power `x`.
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp2(mut x: f64) -> f64 {
+    select_implementation! {
+        name: x87_exp2,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let redux = f64::from_bits(0x4338000000000000) / TBLSIZE as f64;
     let p1 = f64::from_bits(0x3fe62e42fefa39ef);
     let p2 = f64::from_bits(0x3fcebfbdff82c575);
diff --git a/libm/src/math/exp2f.rs b/libm/src/math/exp2f.rs
@@ -75,6 +75,12 @@ static EXP2FT: [u64; TBLSIZE] = [
 /// Calculate `2^x`, that is, 2 raised to the power `x`.
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn exp2f(mut x: f32) -> f32 {
+    select_implementation! {
+        name: x87_exp2f,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let redux = f32::from_bits(0x4b400000) / TBLSIZE as f32;
     let p1 = f32::from_bits(0x3f317218);
     let p2 = f32::from_bits(0x3e75fdf0);
diff --git a/libm/src/math/expf.rs b/libm/src/math/expf.rs
@@ -32,6 +32,12 @@ const P2: f32 = -2.7667332906e-3; /* -0xb55215.0p-32 */
 /// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
 #[cfg_attr(assert_no_panic, no_panic::no_panic)]
 pub fn expf(mut x: f32) -> f32 {
+    select_implementation! {
+        name: x87_expf,
+        use_arch_required: x86_no_sse,
+        args: x,
+    }
+
     let x1p127 = f32::from_bits(0x7f000000); // 0x1p127f === 2 ^ 127
     let x1p_126 = f32::from_bits(0x800000); // 0x1p-126f === 2 ^ -126  /*original 0x1p-149f    ??????????? */
     let mut hx = x.to_bits();

Original file line number	Diff line number	Diff line change
`@@ -48,3 +48,8 @@ cfg_if! {`
`48`	`48`	`pub use i586::{ceil, floor};`
`49`	`49`	`}`
`50`	`50`	`}`
	`51`	`+cfg_if! {`
	`52`	`+ if #[cfg(x86_no_sse)] {`
	`53`	`+ pub use i586::{x87_exp10f, x87_exp10, x87_expf, x87_exp, x87_exp2f, x87_exp2};`
	`54`	`+ }`
	`55`	`+}`