From ab3c498d81295116fc578adc2facb4dd1e932020 Mon Sep 17 00:00:00 2001 From: PingTakPeterTang Date: Wed, 6 Mar 2024 17:55:10 -0800 Subject: [PATCH] fix formatting --- include/foo.h | 11 -- include/rvvlm_erfcinvD.inc.h | 229 ++++++++++++++------------- include/rvvlm_erfinvD.inc.h | 207 ++++++++++++++----------- include/rvvlm_inverrorfuncsD.h | 276 ++++++++++++++++----------------- test/src/test_erfcinv.cpp | 46 ++++-- test/src/test_erfinv.cpp | 59 ++++--- 6 files changed, 444 insertions(+), 384 deletions(-) delete mode 100755 include/foo.h diff --git a/include/foo.h b/include/foo.h deleted file mode 100755 index 49c5391..0000000 --- a/include/foo.h +++ /dev/null @@ -1,11 +0,0 @@ - -#define PSTEP_I_SLL(COEFF_j, X, K, POLY, vlen) \ - __riscv_vsadd(__riscv_vsll( \ - __riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \ - (COEFF_j), (vlen)) - -#define PSTEP_I_SRA(COEFF_j, X, K, POLY, vlen) \ - __riscv_vsadd(__riscv_vsra( \ - __riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \ - (COEFF_j), (vlen)) - diff --git a/include/rvvlm_erfcinvD.inc.h b/include/rvvlm_erfcinvD.inc.h index 15816ee..609f797 100644 --- a/include/rvvlm_erfcinvD.inc.h +++ b/include/rvvlm_erfcinvD.inc.h @@ -8,66 +8,65 @@ #define F_VER1 RVVLM_ERFCINVDI_STD #endif -//Erfcinv is defined on (0, 2). Suffices to consider (0, 1] -//Two regions of approximation: left is [0, 0x1.2p-2) and right is [0x1.2p-1, 1) -//Both are done with rational functions. -//For right, t*P(t)/Q(t) t = 1-x; x in [0x1.2p-1, 1) -//For left, y*P(t)/Q(t), y = sqrt(-log(x)); and t = 1/y - -// P_coefficients in asending order, all in Q79. p0_delta is in floating point -#define P_right_0 -0x48dbe9f5b3eabaa -#define P_right_1 -0xb35279f1a626ae5 -#define P_right_2 0x33789911873d184a -#define P_right_3 -0x1bf9138fc77c0fbf -#define P_right_4 -0x2d4ec43bc48403d4 -#define P_right_5 0x2d61deb53842cca1 -#define P_right_6 0x23324eca6b3ff02 -#define P_right_7 -0xd4ec1d31542c4fc -#define P_right_8 0x2ecf3c60308b0f2 -#define P_right_9 0x7c917b3378071e -#define P_right_10 -0x1e09b597f226ca +// Erfcinv is defined on (0, 2). Suffices to consider (0, 1] +// Two regions of approximation: left is [0, 0x1.2p-2) and right is [0x1.2p-1, +// 1) Both are done with rational functions. For right, t*P(t)/Q(t) t = 1-x; x +// in [0x1.2p-1, 1) For left, y*P(t)/Q(t), y = sqrt(-log(x)); and t = 1/y + +// P_coefficients in asending order, all in Q79. p0_delta is in floating point +#define P_right_0 -0x48dbe9f5b3eabaa +#define P_right_1 -0xb35279f1a626ae5 +#define P_right_2 0x33789911873d184a +#define P_right_3 -0x1bf9138fc77c0fbf +#define P_right_4 -0x2d4ec43bc48403d4 +#define P_right_5 0x2d61deb53842cca1 +#define P_right_6 0x23324eca6b3ff02 +#define P_right_7 -0xd4ec1d31542c4fc +#define P_right_8 0x2ecf3c60308b0f2 +#define P_right_9 0x7c917b3378071e +#define P_right_10 -0x1e09b597f226ca #define DELTA_P0_right -0x1.ec7dc41c17860p-2 -// Q_coefficients in asending order, all in Q79. q0_delta is in floating point -#define Q_right_0 -0x52366e5b14c0970 -#define Q_right_1 -0xca57e95abcc599b -#define Q_right_2 0x3b6c91ec67f5759c -#define Q_right_3 -0x1c40d5daa3be22bc -#define Q_right_4 -0x41f11eb5d837386c -#define Q_right_5 0x3c6ce478fcd75c9a -#define Q_right_6 0xbb1cd7270cfba1d -#define Q_right_7 -0x1988a4116498f1af -#define Q_right_8 0x44dc3042f103d20 -#define Q_right_9 0x2390e683d02edf3 -#define Q_right_10 -0x8ec66f2a7e410c +// Q_coefficients in asending order, all in Q79. q0_delta is in floating point +#define Q_right_0 -0x52366e5b14c0970 +#define Q_right_1 -0xca57e95abcc599b +#define Q_right_2 0x3b6c91ec67f5759c +#define Q_right_3 -0x1c40d5daa3be22bc +#define Q_right_4 -0x41f11eb5d837386c +#define Q_right_5 0x3c6ce478fcd75c9a +#define Q_right_6 0xbb1cd7270cfba1d +#define Q_right_7 -0x1988a4116498f1af +#define Q_right_8 0x44dc3042f103d20 +#define Q_right_9 0x2390e683d02edf3 +#define Q_right_10 -0x8ec66f2a7e410c #define DELTA_Q0_right -0x1.29a0161e99446p-3 -// P_coefficients in asending order, all in Q67. p0_delta is in floating point -#define P_left_0 0x17a0bb69321df -#define P_left_1 0x402eb416ae6015 -#define P_left_2 0x2973eb18028ce34 -#define P_left_3 0x8034a7ece1d5370 -#define P_left_4 0xa76c08a74dae273 -#define P_left_5 0x11dd3876b83dd078 -#define P_left_6 0xfdd7693c3b77653 -#define P_left_7 0xb33d66152b3c223 -#define P_left_8 0x5a564c28c6a41a9 -#define P_left_9 0x1190449fe630213 -#define P_left_10 -0x659c784274e1 +// P_coefficients in asending order, all in Q67. p0_delta is in floating point +#define P_left_0 0x17a0bb69321df +#define P_left_1 0x402eb416ae6015 +#define P_left_2 0x2973eb18028ce34 +#define P_left_3 0x8034a7ece1d5370 +#define P_left_4 0xa76c08a74dae273 +#define P_left_5 0x11dd3876b83dd078 +#define P_left_6 0xfdd7693c3b77653 +#define P_left_7 0xb33d66152b3c223 +#define P_left_8 0x5a564c28c6a41a9 +#define P_left_9 0x1190449fe630213 +#define P_left_10 -0x659c784274e1 #define DELTA_P0_left -0x1.d622f4cbe0eeep-2 -// Q_coefficients in asending order, all in Q67. q0_delta is in floating point -#define Q_left_0 0x17a09aabf9cee -#define Q_left_1 0x4030b9059ffcad -#define Q_left_2 0x29b26b0d87f7855 -#define Q_left_3 0x87572a13d3fa2dd -#define Q_left_4 0xd7a728b5620ac3c -#define Q_left_5 0x1754392b473fd439 -#define Q_left_6 0x1791b9a091a816c2 -#define Q_left_7 0x167f71db9e13b075 -#define Q_left_8 0xcb9f5f3e5e618a4 -#define Q_left_9 0x68271fae767c68e -#define Q_left_10 0x13745c4fa224b25 +// Q_coefficients in asending order, all in Q67. q0_delta is in floating point +#define Q_left_0 0x17a09aabf9cee +#define Q_left_1 0x4030b9059ffcad +#define Q_left_2 0x29b26b0d87f7855 +#define Q_left_3 0x87572a13d3fa2dd +#define Q_left_4 0xd7a728b5620ac3c +#define Q_left_5 0x1754392b473fd439 +#define Q_left_6 0x1791b9a091a816c2 +#define Q_left_7 0x167f71db9e13b075 +#define Q_left_8 0xcb9f5f3e5e618a4 +#define Q_left_9 0x68271fae767c68e +#define Q_left_10 0x13745c4fa224b25 #define DELTA_Q0_left 0x1.f7e7557a34ae6p-2 void F_VER1(API) { @@ -75,7 +74,7 @@ void F_VER1(API) { VFLOAT vx, vx_sign, vy, vy_special; VBOOL special_args; - VFLOAT dummy; + VFLOAT dummy; SET_ROUNDTONEAREST; // stripmining over input arguments @@ -92,7 +91,7 @@ void F_VER1(API) { VFLOAT two_minus_x = __riscv_vfadd(vx_sign, fp_posOne, vlen); VBOOL x_gt_1 = __riscv_vmflt(vx_sign, fp_posZero, vlen); vx = __riscv_vmerge(vx, two_minus_x, x_gt_1, vlen); - // vx is now in (0, 1] + // vx is now in (0, 1] VBOOL x_in_left = __riscv_vmfle(vx, 0x1.2p-2, vlen); VFLOAT w_hi, w_lo, w_hi_left, w_lo_left, y_hi, y_lo; @@ -100,22 +99,22 @@ void F_VER1(API) { VBOOL x_is_tiny; if (__riscv_vcpop(x_in_left, vlen) > 0) { - VFLOAT x_left = VFMV_VF(0x1.0p-3, vlen); - x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen); - x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-52, vlen); - INT n_adjust = 60; - x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen); - NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen); - - SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left, - 0x1.0p63, 0x1.0p-63, vlen); - if (__riscv_vcpop(x_is_tiny, vlen) > 0) { - VFLOAT w_hi_dummy, w_lo_dummy; - SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, - 0x1.0p65, 0x1.0p-65, vlen); - dummy = __riscv_vfmul(__riscv_vfcvt_f(T_tiny, vlen), 0x1.0p-2, vlen); - dummy = __riscv_vfsub(dummy, __riscv_vfcvt_f(T_left, vlen), vlen); - } + VFLOAT x_left = VFMV_VF(0x1.0p-3, vlen); + x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen); + x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-52, vlen); + INT n_adjust = 60; + x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen); + NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen); + + SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left, 0x1.0p63, + 0x1.0p-63, vlen); + if (__riscv_vcpop(x_is_tiny, vlen) > 0) { + VFLOAT w_hi_dummy, w_lo_dummy; + SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p65, + 0x1.0p-65, vlen); + dummy = __riscv_vfmul(__riscv_vfcvt_f(T_tiny, vlen), 0x1.0p-2, vlen); + dummy = __riscv_vfsub(dummy, __riscv_vfcvt_f(T_left, vlen), vlen); + } } w_hi = VFMV_VF(fp_posOne, vlen); w_hi = __riscv_vfsub(w_hi, vx, vlen); @@ -125,54 +124,76 @@ void F_VER1(API) { VFLOAT delta_t = __riscv_vfmul(w_lo, 0x1.0p63, vlen); T = __riscv_vadd(T, __riscv_vfcvt_x(delta_t, vlen), vlen); T = __riscv_vmerge(T, T_left, x_in_left, vlen); - + w_hi = __riscv_vmerge(w_hi, w_hi_left, x_in_left, vlen); w_lo = __riscv_vmerge(w_lo, w_lo_left, x_in_left, vlen); - //For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T) + // For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T) VINT P, Q; P = __riscv_vmerge(VMVI_VX(P_right_10, vlen), P_left_10, x_in_left, vlen); - P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T, - PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T, - PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T, - PSTEP_I_ab(x_in_left, P_left_9, P_right_9, T, P, - vlen), vlen), vlen), vlen); + P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T, + PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T, + PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T, + PSTEP_I_ab(x_in_left, P_left_9, + P_right_9, T, P, vlen), + vlen), + vlen), + vlen); Q = __riscv_vmerge(VMVI_VX(Q_right_10, vlen), Q_left_10, x_in_left, vlen); - Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T, - PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T, - PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T, - PSTEP_I_ab(x_in_left, Q_left_9, Q_right_9, T, Q, - vlen), vlen), vlen), vlen); - - P = PSTEP_I_ab(x_in_left, P_left_0, P_right_0, T, - PSTEP_I_ab(x_in_left, P_left_1, P_right_1, T, - PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T, - PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T, - PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T, - PSTEP_I_ab(x_in_left, P_left_5, P_right_5, T, P, - vlen), vlen), vlen), vlen), vlen), vlen); - - Q = PSTEP_I_ab(x_in_left, Q_left_0, Q_right_0, T, - PSTEP_I_ab(x_in_left, Q_left_1, Q_right_1, T, - PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T, - PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T, - PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T, - PSTEP_I_ab(x_in_left, Q_left_5, Q_right_5, T, Q, - vlen), vlen), vlen), vlen), vlen), vlen); + Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T, + PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T, + PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T, + PSTEP_I_ab(x_in_left, Q_left_9, + Q_right_9, T, Q, vlen), + vlen), + vlen), + vlen); + + P = PSTEP_I_ab( + x_in_left, P_left_0, P_right_0, T, + PSTEP_I_ab( + x_in_left, P_left_1, P_right_1, T, + PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T, + PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T, + PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T, + PSTEP_I_ab(x_in_left, P_left_5, + P_right_5, T, P, vlen), + vlen), + vlen), + vlen), + vlen), + vlen); + + Q = PSTEP_I_ab( + x_in_left, Q_left_0, Q_right_0, T, + PSTEP_I_ab( + x_in_left, Q_left_1, Q_right_1, T, + PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T, + PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T, + PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T, + PSTEP_I_ab(x_in_left, Q_left_5, + Q_right_5, T, Q, vlen), + vlen), + vlen), + vlen), + vlen), + vlen); VFLOAT p_hi, p_lo; p_hi = __riscv_vfcvt_f(P, vlen); - p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen); + p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), + vlen); VFLOAT delta_p0 = VFMV_VF(DELTA_P0_right, vlen); delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_left, x_in_left, vlen); p_lo = __riscv_vfadd(p_lo, delta_p0, vlen); VFLOAT q_hi, q_lo; q_hi = __riscv_vfcvt_f(Q, vlen); - q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen); + q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), + vlen); VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_right, vlen); delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_left, x_in_left, vlen); q_lo = __riscv_vfadd(q_lo, delta_q0, vlen); @@ -193,8 +214,8 @@ void F_VER1(API) { y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen); DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen); - - vy = w_hi; + + vy = w_hi; vy = __riscv_vfsgnj(vy, vx_sign, vlen); vy = __riscv_vmerge(vy, vy_special, special_args, vlen); diff --git a/include/rvvlm_erfinvD.inc.h b/include/rvvlm_erfinvD.inc.h index f7d78b4..b1f346d 100644 --- a/include/rvvlm_erfinvD.inc.h +++ b/include/rvvlm_erfinvD.inc.h @@ -8,65 +8,64 @@ #define F_VER1 RVVLM_ERFINVDI_STD #endif -//Two regions of approximation: left is [0, 0x1.7p-1) and right is [0x1.7p-1, 1) -//Both are done with rational functions. -//For left, x*P(x)/Q(x) x in [0, 0x1.7p-1) -//For right, y*P(t)/Q(t), y = sqrt(-log(1-x)); and t = 1/y - -// P_coefficients in asending order, all in Q79. p0_delta is in floating point -#define P_left_0 -0x48dbe9f5b3eabaa -#define P_left_1 -0xb35279f1a626ae5 -#define P_left_2 0x33789911873d184a -#define P_left_3 -0x1bf9138fc77c0fbf -#define P_left_4 -0x2d4ec43bc48403d4 -#define P_left_5 0x2d61deb53842cca1 -#define P_left_6 0x23324eca6b3ff02 -#define P_left_7 -0xd4ec1d31542c4fc -#define P_left_8 0x2ecf3c60308b0f2 -#define P_left_9 0x7c917b3378071e -#define P_left_10 -0x1e09b597f226ca +// Two regions of approximation: left is [0, 0x1.7p-1) and right is [0x1.7p-1, +// 1) Both are done with rational functions. For left, x*P(x)/Q(x) x in [0, +// 0x1.7p-1) For right, y*P(t)/Q(t), y = sqrt(-log(1-x)); and t = 1/y + +// P_coefficients in asending order, all in Q79. p0_delta is in floating point +#define P_left_0 -0x48dbe9f5b3eabaa +#define P_left_1 -0xb35279f1a626ae5 +#define P_left_2 0x33789911873d184a +#define P_left_3 -0x1bf9138fc77c0fbf +#define P_left_4 -0x2d4ec43bc48403d4 +#define P_left_5 0x2d61deb53842cca1 +#define P_left_6 0x23324eca6b3ff02 +#define P_left_7 -0xd4ec1d31542c4fc +#define P_left_8 0x2ecf3c60308b0f2 +#define P_left_9 0x7c917b3378071e +#define P_left_10 -0x1e09b597f226ca #define DELTA_P0_left -0x1.ec7dc41c17860p-2 -// Q_coefficients in asending order, all in Q79. q0_delta is in floating point -#define Q_left_0 -0x52366e5b14c0970 -#define Q_left_1 -0xca57e95abcc599b -#define Q_left_2 0x3b6c91ec67f5759c -#define Q_left_3 -0x1c40d5daa3be22bc -#define Q_left_4 -0x41f11eb5d837386c -#define Q_left_5 0x3c6ce478fcd75c9a -#define Q_left_6 0xbb1cd7270cfba1d -#define Q_left_7 -0x1988a4116498f1af -#define Q_left_8 0x44dc3042f103d20 -#define Q_left_9 0x2390e683d02edf3 -#define Q_left_10 -0x8ec66f2a7e410c +// Q_coefficients in asending order, all in Q79. q0_delta is in floating point +#define Q_left_0 -0x52366e5b14c0970 +#define Q_left_1 -0xca57e95abcc599b +#define Q_left_2 0x3b6c91ec67f5759c +#define Q_left_3 -0x1c40d5daa3be22bc +#define Q_left_4 -0x41f11eb5d837386c +#define Q_left_5 0x3c6ce478fcd75c9a +#define Q_left_6 0xbb1cd7270cfba1d +#define Q_left_7 -0x1988a4116498f1af +#define Q_left_8 0x44dc3042f103d20 +#define Q_left_9 0x2390e683d02edf3 +#define Q_left_10 -0x8ec66f2a7e410c #define DELTA_Q0_left -0x1.29a0161e99446p-3 -// P_coefficients in asending order, all in Q67. p0_delta is in floating point -#define P_right_0 0x17a0bb69321df -#define P_right_1 0x402eb416ae6015 -#define P_right_2 0x2973eb18028ce34 -#define P_right_3 0x8034a7ece1d5370 -#define P_right_4 0xa76c08a74dae273 -#define P_right_5 0x11dd3876b83dd078 -#define P_right_6 0xfdd7693c3b77653 -#define P_right_7 0xb33d66152b3c223 -#define P_right_8 0x5a564c28c6a41a9 -#define P_right_9 0x1190449fe630213 -#define P_right_10 -0x659c784274e1 +// P_coefficients in asending order, all in Q67. p0_delta is in floating point +#define P_right_0 0x17a0bb69321df +#define P_right_1 0x402eb416ae6015 +#define P_right_2 0x2973eb18028ce34 +#define P_right_3 0x8034a7ece1d5370 +#define P_right_4 0xa76c08a74dae273 +#define P_right_5 0x11dd3876b83dd078 +#define P_right_6 0xfdd7693c3b77653 +#define P_right_7 0xb33d66152b3c223 +#define P_right_8 0x5a564c28c6a41a9 +#define P_right_9 0x1190449fe630213 +#define P_right_10 -0x659c784274e1 #define DELTA_P0_right -0x1.d622f4cbe0eeep-2 -// Q_coefficients in asending order, all in Q67. q0_delta is in floating point -#define Q_right_0 0x17a09aabf9cee -#define Q_right_1 0x4030b9059ffcad -#define Q_right_2 0x29b26b0d87f7855 -#define Q_right_3 0x87572a13d3fa2dd -#define Q_right_4 0xd7a728b5620ac3c -#define Q_right_5 0x1754392b473fd439 -#define Q_right_6 0x1791b9a091a816c2 -#define Q_right_7 0x167f71db9e13b075 -#define Q_right_8 0xcb9f5f3e5e618a4 -#define Q_right_9 0x68271fae767c68e -#define Q_right_10 0x13745c4fa224b25 +// Q_coefficients in asending order, all in Q67. q0_delta is in floating point +#define Q_right_0 0x17a09aabf9cee +#define Q_right_1 0x4030b9059ffcad +#define Q_right_2 0x29b26b0d87f7855 +#define Q_right_3 0x87572a13d3fa2dd +#define Q_right_4 0xd7a728b5620ac3c +#define Q_right_5 0x1754392b473fd439 +#define Q_right_6 0x1791b9a091a816c2 +#define Q_right_7 0x167f71db9e13b075 +#define Q_right_8 0xcb9f5f3e5e618a4 +#define Q_right_9 0x68271fae767c68e +#define Q_right_10 0x13745c4fa224b25 #define DELTA_Q0_right 0x1.f7e7557a34ae6p-2 void F_VER1(API) { @@ -91,69 +90,91 @@ void F_VER1(API) { VINT T, T_right; if (__riscv_vcpop(x_in_right, vlen) > 0) { - VFLOAT one_minus_x; - one_minus_x = __riscv_vfrsub(vx, fp_posOne, vlen); + VFLOAT one_minus_x; + one_minus_x = __riscv_vfrsub(vx, fp_posOne, vlen); - VINT n_adjust; - n_adjust = __riscv_vxor(n_adjust, n_adjust, vlen); + VINT n_adjust; + n_adjust = __riscv_vxor(n_adjust, n_adjust, vlen); - NEG_LOGX_4_TRANSFORM(one_minus_x, n_adjust, y_hi, y_lo, vlen); + NEG_LOGX_4_TRANSFORM(one_minus_x, n_adjust, y_hi, y_lo, vlen); - SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_right, w_lo_right, T_right, - 0x1.0p63, 0x1.0p-63, vlen); + SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_right, w_lo_right, T_right, 0x1.0p63, + 0x1.0p-63, vlen); } T = __riscv_vfcvt_x(__riscv_vfmul(vx, 0x1.0p63, vlen), vlen); T = __riscv_vmerge(T, T_right, x_in_right, vlen); - + w_hi = vx; w_lo = I_AS_F(__riscv_vxor(F_AS_I(w_lo), F_AS_I(w_lo), vlen)); w_hi = __riscv_vmerge(w_hi, w_hi_right, x_in_right, vlen); w_lo = __riscv_vmerge(w_lo, w_lo_right, x_in_right, vlen); - //For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T) + // For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T) VINT P, Q; P = __riscv_vmerge(VMVI_VX(P_left_10, vlen), P_right_10, x_in_right, vlen); - P = PSTEP_I_ab(x_in_right, P_right_6, P_left_6, T, - PSTEP_I_ab(x_in_right, P_right_7, P_left_7, T, - PSTEP_I_ab(x_in_right, P_right_8, P_left_8, T, - PSTEP_I_ab(x_in_right, P_right_9, P_left_9, T, P, - vlen), vlen), vlen), vlen); + P = PSTEP_I_ab(x_in_right, P_right_6, P_left_6, T, + PSTEP_I_ab(x_in_right, P_right_7, P_left_7, T, + PSTEP_I_ab(x_in_right, P_right_8, P_left_8, T, + PSTEP_I_ab(x_in_right, P_right_9, + P_left_9, T, P, vlen), + vlen), + vlen), + vlen); Q = __riscv_vmerge(VMVI_VX(Q_left_10, vlen), Q_right_10, x_in_right, vlen); - Q = PSTEP_I_ab(x_in_right, Q_right_6, Q_left_6, T, - PSTEP_I_ab(x_in_right, Q_right_7, Q_left_7, T, - PSTEP_I_ab(x_in_right, Q_right_8, Q_left_8, T, - PSTEP_I_ab(x_in_right, Q_right_9, Q_left_9, T, Q, - vlen), vlen), vlen), vlen); - - P = PSTEP_I_ab(x_in_right, P_right_0, P_left_0, T, - PSTEP_I_ab(x_in_right, P_right_1, P_left_1, T, - PSTEP_I_ab(x_in_right, P_right_2, P_left_2, T, - PSTEP_I_ab(x_in_right, P_right_3, P_left_3, T, - PSTEP_I_ab(x_in_right, P_right_4, P_left_4, T, - PSTEP_I_ab(x_in_right, P_right_5, P_left_5, T, P, - vlen), vlen), vlen), vlen), vlen), vlen); - - Q = PSTEP_I_ab(x_in_right, Q_right_0, Q_left_0, T, - PSTEP_I_ab(x_in_right, Q_right_1, Q_left_1, T, - PSTEP_I_ab(x_in_right, Q_right_2, Q_left_2, T, - PSTEP_I_ab(x_in_right, Q_right_3, Q_left_3, T, - PSTEP_I_ab(x_in_right, Q_right_4, Q_left_4, T, - PSTEP_I_ab(x_in_right, Q_right_5, Q_left_5, T, Q, - vlen), vlen), vlen), vlen), vlen), vlen); - + Q = PSTEP_I_ab(x_in_right, Q_right_6, Q_left_6, T, + PSTEP_I_ab(x_in_right, Q_right_7, Q_left_7, T, + PSTEP_I_ab(x_in_right, Q_right_8, Q_left_8, T, + PSTEP_I_ab(x_in_right, Q_right_9, + Q_left_9, T, Q, vlen), + vlen), + vlen), + vlen); + + P = PSTEP_I_ab( + x_in_right, P_right_0, P_left_0, T, + PSTEP_I_ab( + x_in_right, P_right_1, P_left_1, T, + PSTEP_I_ab(x_in_right, P_right_2, P_left_2, T, + PSTEP_I_ab(x_in_right, P_right_3, P_left_3, T, + PSTEP_I_ab(x_in_right, P_right_4, P_left_4, T, + PSTEP_I_ab(x_in_right, P_right_5, + P_left_5, T, P, vlen), + vlen), + vlen), + vlen), + vlen), + vlen); + + Q = PSTEP_I_ab( + x_in_right, Q_right_0, Q_left_0, T, + PSTEP_I_ab( + x_in_right, Q_right_1, Q_left_1, T, + PSTEP_I_ab(x_in_right, Q_right_2, Q_left_2, T, + PSTEP_I_ab(x_in_right, Q_right_3, Q_left_3, T, + PSTEP_I_ab(x_in_right, Q_right_4, Q_left_4, T, + PSTEP_I_ab(x_in_right, Q_right_5, + Q_left_5, T, Q, vlen), + vlen), + vlen), + vlen), + vlen), + vlen); + VFLOAT p_hi, p_lo; p_hi = __riscv_vfcvt_f(P, vlen); - p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen); + p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), + vlen); VFLOAT delta_p0 = VFMV_VF(DELTA_P0_left, vlen); delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_right, x_in_right, vlen); p_lo = __riscv_vfadd(p_lo, delta_p0, vlen); VFLOAT q_hi, q_lo; q_hi = __riscv_vfcvt_f(Q, vlen); - q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen); + q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), + vlen); VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_left, vlen); delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_right, x_in_right, vlen); q_lo = __riscv_vfadd(q_lo, delta_q0, vlen); @@ -165,8 +186,8 @@ void F_VER1(API) { y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen); DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen); - - vy = w_hi; + + vy = w_hi; vy = __riscv_vfsgnj(vy, vx_orig, vlen); vy = __riscv_vmerge(vy, vy_special, special_args, vlen); diff --git a/include/rvvlm_inverrorfuncsD.h b/include/rvvlm_inverrorfuncsD.h index 1b87630..8e1b619 100644 --- a/include/rvvlm_inverrorfuncsD.h +++ b/include/rvvlm_inverrorfuncsD.h @@ -7,30 +7,32 @@ #define NEG_LOG2_HI -0x1.62e42fefa4000p-1 #define NEG_LOG2_LO 0x1.8432a1b0e2634p-43 -// P_coefficients in asending order, in varying scales. p0_delta is in floating point -#define P_tiny_0 -0x8593442e139d // scale 66 -#define P_tiny_1 -0x1fcf7055ac5f03 // scale 64 -#define P_tiny_2 -0x106dde33d8dc179 // scale 61 -#define P_tiny_3 -0xc31d6e09935118a // scale 60 -#define P_tiny_4 -0x3560de73cb5bbcc0 // scale 59 -#define P_tiny_5 -0x1eb4c7e14b254de8 // scale 57 -#define P_tiny_6 0x1fdf5a9d23430bd7 // scale 56 -#define P_tiny_7 0x62c4020631de121b // scale 56 -#define P_tiny_8 0x4be1ed5d031773f1 // scale 58 -#define P_tiny_9 0x55a9f8b9538981a1 // scale 60 -#define DELTA_P0_tiny 0x1.ba4d0b79d16e6p-2 // scale 66 +// P_coefficients in asending order, in varying scales. p0_delta is in floating +// point +#define P_tiny_0 -0x8593442e139d // scale 66 +#define P_tiny_1 -0x1fcf7055ac5f03 // scale 64 +#define P_tiny_2 -0x106dde33d8dc179 // scale 61 +#define P_tiny_3 -0xc31d6e09935118a // scale 60 +#define P_tiny_4 -0x3560de73cb5bbcc0 // scale 59 +#define P_tiny_5 -0x1eb4c7e14b254de8 // scale 57 +#define P_tiny_6 0x1fdf5a9d23430bd7 // scale 56 +#define P_tiny_7 0x62c4020631de121b // scale 56 +#define P_tiny_8 0x4be1ed5d031773f1 // scale 58 +#define P_tiny_9 0x55a9f8b9538981a1 // scale 60 +#define DELTA_P0_tiny 0x1.ba4d0b79d16e6p-2 // scale 66 -// Q_coefficients in asending order, in varying scales. q0_delta is in floating point -#define Q_tiny_0 -0x85933cda2d6d // sacle 66 -#define Q_tiny_1 -0x1fcf792da7d51d // sacle 64 -#define Q_tiny_2 -0x106ec6e0ed13ae1 // sacle 61 -#define Q_tiny_3 -0x61b925a39a461aa // sacle 59 -#define Q_tiny_4 -0x35ebf9dc72fab062 // sacle 59 -#define Q_tiny_5 -0x2131cf7760e82873 // sacle 57 -#define Q_tiny_6 0x1860ae67db2a6609 // sacle 56 -#define Q_tiny_7 0x5e9a123701d89289 // sacle 56 -#define Q_tiny_8 0x417b35aab14ac49d // sacle 56 -#define Q_tiny_9 0x5e4a26a7c1415755 // sacle 57 +// Q_coefficients in asending order, in varying scales. q0_delta is in floating +// point +#define Q_tiny_0 -0x85933cda2d6d // sacle 66 +#define Q_tiny_1 -0x1fcf792da7d51d // sacle 64 +#define Q_tiny_2 -0x106ec6e0ed13ae1 // sacle 61 +#define Q_tiny_3 -0x61b925a39a461aa // sacle 59 +#define Q_tiny_4 -0x35ebf9dc72fab062 // sacle 59 +#define Q_tiny_5 -0x2131cf7760e82873 // sacle 57 +#define Q_tiny_6 0x1860ae67db2a6609 // sacle 56 +#define Q_tiny_7 0x5e9a123701d89289 // sacle 56 +#define Q_tiny_8 0x417b35aab14ac49d // sacle 56 +#define Q_tiny_9 0x5e4a26a7c1415755 // sacle 57 #define DELTA_Q0_tiny 0x1.8a7adad44d65ap-4 // scale 66 // erfinv(+-1) = +-Inf with divide by zero @@ -93,123 +95,119 @@ } while (0) // Compute -log(2^(-n_adjust) * x), where x < 1 -#define NEG_LOGX_4_TRANSFORM(vx, n_adjust, y_hi, y_lo, vlen) \ - do { \ - /* work on entire vector register */ \ - VFLOAT vx_in = (vx); \ - VINT n = __riscv_vadd(__riscv_vsra(F_AS_I(vx_in), MAN_LEN - 8, (vlen)), 0x96, \ - vlen); \ - n = __riscv_vsub(__riscv_vsra(n, 8, vlen), EXP_BIAS, vlen); \ - VFLOAT scale = I_AS_F(__riscv_vsll(__riscv_vrsub(n, EXP_BIAS, (vlen)), MAN_LEN, (vlen))); \ - vx_in = __riscv_vfmul(vx_in, scale, (vlen)); \ - /* x is scaled, and -log(x) is 2 atanh(w/2); w = 2(1-x)/(1+x) */ \ - n = __riscv_vsub(n, (n_adjust), (vlen)); \ - VFLOAT n_flt = __riscv_vfcvt_f(n, (vlen)); \ - VFLOAT numer = __riscv_vfrsub(vx_in, fp_posOne, (vlen)); \ - /* note that 1-x is exact as 1/2 < x < 2 */ \ - numer = __riscv_vfadd(numer, numer, (vlen)); \ - VFLOAT denom = __riscv_vfadd(vx_in, fp_posOne, (vlen)); \ - VFLOAT delta_denom = __riscv_vfadd(__riscv_vfrsub(denom, fp_posOne, (vlen)), vx_in, (vlen)); \ - /* note that 1 - denom is exact even if denom > 2 */ \ - /* becase 1 has many trailing zeros */ \ - VFLOAT r_hi, r_lo, r; \ - DIV_N1D2(numer, denom, delta_denom, r_hi, r_lo, (vlen)); \ - r = __riscv_vfadd(r_hi, r_lo, (vlen)); \ - /* for the original unscaled x, we have */ \ - /* -log(x) = -n * log(2) + 2 atanh(-w/2) */ \ - /* where w = 2(1-x)/(1+x); -w = 2(x-1)/(x+1) */ \ - VFLOAT A, B; \ - A = __riscv_vfmadd(n_flt, NEG_LOG2_HI, r_hi, (vlen)); \ - B = __riscv_vfmsub(n_flt, NEG_LOG2_HI, A, (vlen)); \ - B = __riscv_vfadd(B, r_hi, (vlen)); \ - B = __riscv_vfadd(r_lo, B, (vlen)); \ - VFLOAT rsq = __riscv_vfmul(r, r, (vlen)); \ - VFLOAT rcube = __riscv_vfmul(rsq, r, (vlen)); \ - VFLOAT r6 = __riscv_vfmul(rcube, rcube, (vlen)); \ - VFLOAT poly_right = PSTEP(0x1.74681ff881228p-14, rsq, \ - PSTEP( 0x1.39751be23e4a3p-16, \ - 0x1.30a893993e73dp-18, rsq, vlen), \ - vlen); \ - VFLOAT poly_left = PSTEP(0x1.999999996ce82p-7, rsq, \ - PSTEP(0x1.249249501b1adp-9, \ - 0x1.c71c47e7189f6p-12, rsq, vlen), \ - vlen); \ - poly_left = __riscv_vfmacc(poly_left, r6, poly_right, (vlen)); \ - poly_left = PSTEP(0x1.55555555555dbp-4, rsq, poly_left, (vlen)); \ - B = __riscv_vfmacc(B, NEG_LOG2_LO, n_flt, (vlen)); \ - B = __riscv_vfmacc(B, rcube, poly_left, (vlen)); \ - FAST2SUM(A, B, (y_hi), (y_lo), (vlen)); \ - /* A + B is -log(x) with extra precision, |B| \le ulp(A)/2 */ \ - } while(0) +#define NEG_LOGX_4_TRANSFORM(vx, n_adjust, y_hi, y_lo, vlen) \ + do { \ + /* work on entire vector register */ \ + VFLOAT vx_in = (vx); \ + VINT n = __riscv_vadd(__riscv_vsra(F_AS_I(vx_in), MAN_LEN - 8, (vlen)), \ + 0x96, vlen); \ + n = __riscv_vsub(__riscv_vsra(n, 8, vlen), EXP_BIAS, vlen); \ + VFLOAT scale = I_AS_F( \ + __riscv_vsll(__riscv_vrsub(n, EXP_BIAS, (vlen)), MAN_LEN, (vlen))); \ + vx_in = __riscv_vfmul(vx_in, scale, (vlen)); \ + /* x is scaled, and -log(x) is 2 atanh(w/2); w = 2(1-x)/(1+x) */ \ + n = __riscv_vsub(n, (n_adjust), (vlen)); \ + VFLOAT n_flt = __riscv_vfcvt_f(n, (vlen)); \ + VFLOAT numer = __riscv_vfrsub(vx_in, fp_posOne, (vlen)); \ + /* note that 1-x is exact as 1/2 < x < 2 */ \ + numer = __riscv_vfadd(numer, numer, (vlen)); \ + VFLOAT denom = __riscv_vfadd(vx_in, fp_posOne, (vlen)); \ + VFLOAT delta_denom = __riscv_vfadd( \ + __riscv_vfrsub(denom, fp_posOne, (vlen)), vx_in, (vlen)); \ + /* note that 1 - denom is exact even if denom > 2 */ \ + /* becase 1 has many trailing zeros */ \ + VFLOAT r_hi, r_lo, r; \ + DIV_N1D2(numer, denom, delta_denom, r_hi, r_lo, (vlen)); \ + r = __riscv_vfadd(r_hi, r_lo, (vlen)); \ + /* for the original unscaled x, we have */ \ + /* -log(x) = -n * log(2) + 2 atanh(-w/2) */ \ + /* where w = 2(1-x)/(1+x); -w = 2(x-1)/(x+1) */ \ + VFLOAT A, B; \ + A = __riscv_vfmadd(n_flt, NEG_LOG2_HI, r_hi, (vlen)); \ + B = __riscv_vfmsub(n_flt, NEG_LOG2_HI, A, (vlen)); \ + B = __riscv_vfadd(B, r_hi, (vlen)); \ + B = __riscv_vfadd(r_lo, B, (vlen)); \ + VFLOAT rsq = __riscv_vfmul(r, r, (vlen)); \ + VFLOAT rcube = __riscv_vfmul(rsq, r, (vlen)); \ + VFLOAT r6 = __riscv_vfmul(rcube, rcube, (vlen)); \ + VFLOAT poly_right = PSTEP( \ + 0x1.74681ff881228p-14, rsq, \ + PSTEP(0x1.39751be23e4a3p-16, 0x1.30a893993e73dp-18, rsq, vlen), vlen); \ + VFLOAT poly_left = PSTEP( \ + 0x1.999999996ce82p-7, rsq, \ + PSTEP(0x1.249249501b1adp-9, 0x1.c71c47e7189f6p-12, rsq, vlen), vlen); \ + poly_left = __riscv_vfmacc(poly_left, r6, poly_right, (vlen)); \ + poly_left = PSTEP(0x1.55555555555dbp-4, rsq, poly_left, (vlen)); \ + B = __riscv_vfmacc(B, NEG_LOG2_LO, n_flt, (vlen)); \ + B = __riscv_vfmacc(B, rcube, poly_left, (vlen)); \ + FAST2SUM(A, B, (y_hi), (y_lo), (vlen)); \ + /* A + B is -log(x) with extra precision, |B| \le ulp(A)/2 */ \ + } while (0) -// This macro computes w_hi + w_lo = sqrt(y_hi + y_lo) in floating point -// and 1/(w_hi + w_lo) as a Q63 fixed-point T -// y_hi, y_lo is normalized on input; that is y_hi has +// This macro computes w_hi + w_lo = sqrt(y_hi + y_lo) in floating point +// and 1/(w_hi + w_lo) as a Q63 fixed-point T +// y_hi, y_lo is normalized on input; that is y_hi has // full working precision of the sum y_hi + y_lo // and 2 log(2) < y_hi < 1100 log(2) -#define SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi, w_lo, T, t_sc, t_sc_inv, vlen) \ - do { \ - (w_hi) = __riscv_vfsqrt((y_hi), (vlen)); \ - (w_lo) = __riscv_vfnmsub((w_hi), (w_hi), (y_hi), (vlen)); \ - (w_lo) = __riscv_vfadd((w_lo), (y_lo), (vlen)); \ - VFLOAT recip; \ - recip = __riscv_vfadd((y_hi), (y_hi), (vlen)); \ - recip = __riscv_vfrec7(recip, (vlen)); \ - recip = __riscv_vfmul(recip, (w_hi), (vlen)); \ - (w_lo) = __riscv_vfmul((w_lo), recip, (vlen)); \ - /* w_hi + w_lo is sqrt(y_hi + y_lo) to extra precision */ \ - /* now compute T = t_sc/(w_hi + w_lo) as fixed point */ \ - VFLOAT t_lo = VFMV_VF((t_sc), (vlen)); \ - VFLOAT t_hi = __riscv_vfdiv(t_lo, (w_hi), (vlen)); \ - (T) = __riscv_vfcvt_x(t_hi, (vlen)); \ - t_lo = __riscv_vfnmsac(t_lo, (w_hi), t_hi, (vlen)); \ - t_lo = __riscv_vfnmsac(t_lo, (w_lo), t_hi, (vlen)); \ - t_lo = __riscv_vfmul(t_lo, t_hi, (vlen)); \ - t_lo = __riscv_vfmul(t_lo, (t_sc_inv), vlen); \ - (T) = __riscv_vadd((T), __riscv_vfcvt_x(t_lo, (vlen)), (vlen)); \ - } while(0) - -#define ERFCINV_PQ_TINY(T, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen) \ - do { \ - /* T is in scale of 65 */ \ - VINT P, Q; \ - P = PSTEP_I_SRA(P_tiny_7, T, 4, PSTEP_I_SRA(P_tiny_8, P_tiny_9, 4, T, \ - (vlen)), (vlen)); \ - /* P in Q_56 */ \ - P = PSTEP_I_SRA(P_tiny_5, T, 1, PSTEP_I_SRA(P_tiny_6, P, 2, T, \ - (vlen)), (vlen)); \ - /* P in Q_57 */ \ - P = PSTEP_I_SRA(P_tiny_3, T, 1, PSTEP_I(P_tiny_4, P, T, \ - (vlen)), (vlen)); \ - /* P in Q_60 */ \ - P = PSTEP_I_SLL(P_tiny_1, T, 1, PSTEP_I_SRA(P_tiny_2, P, 1, T, \ - (vlen)), (vlen)); \ - /* P in Q_64 */ \ - P = PSTEP_I(P_tiny_0, T, P, (vlen));\ - /* P in Q_66 */ \ - \ - Q = PSTEP_I_SRA(Q_tiny_7, T, 2, PSTEP_I_SRA(Q_tiny_8, Q_tiny_9, 3, T, \ - (vlen)), (vlen)); \ - /* Q in Q_56 */ \ - Q = PSTEP_I_SRA(Q_tiny_5, T, 1, PSTEP_I_SRA(Q_tiny_6, Q, 2, T, \ - (vlen)), (vlen)); \ - /* Q in Q_57 */ \ - Q = PSTEP_I_SRA(Q_tiny_3, T, 2, PSTEP_I(Q_tiny_4, Q, T, \ - (vlen)), (vlen)); \ - /* P in Q_59 */ \ - Q = PSTEP_I_SLL(Q_tiny_1, T, 1, PSTEP_I(Q_tiny_2, Q, T, \ - (vlen)), (vlen)); \ - /* Q in Q_64 */ \ - Q = PSTEP_I(Q_tiny_0, T, Q, (vlen));\ - /* Q in Q_66 */ \ - \ - p_hi_tiny = __riscv_vfcvt_f(P, (vlen)); \ - p_lo_tiny = __riscv_vfcvt_f( \ - __riscv_vsub(P, __riscv_vfcvt_x(p_hi_tiny, (vlen)), (vlen)), (vlen)); \ - p_lo_tiny = __riscv_vfadd(p_lo_tiny, DELTA_P0_tiny, (vlen)); \ - q_hi_tiny = __riscv_vfcvt_f(Q, vlen); \ - q_lo_tiny = __riscv_vfcvt_f( \ - __riscv_vsub(Q, __riscv_vfcvt_x(q_hi_tiny, (vlen)), (vlen)), (vlen)); \ - q_lo_tiny = __riscv_vfadd(q_lo, DELTA_Q0_tiny, (vlen)); \ - } while(0) +#define SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi, w_lo, T, t_sc, t_sc_inv, vlen) \ + do { \ + (w_hi) = __riscv_vfsqrt((y_hi), (vlen)); \ + (w_lo) = __riscv_vfnmsub((w_hi), (w_hi), (y_hi), (vlen)); \ + (w_lo) = __riscv_vfadd((w_lo), (y_lo), (vlen)); \ + VFLOAT recip; \ + recip = __riscv_vfadd((y_hi), (y_hi), (vlen)); \ + recip = __riscv_vfrec7(recip, (vlen)); \ + recip = __riscv_vfmul(recip, (w_hi), (vlen)); \ + (w_lo) = __riscv_vfmul((w_lo), recip, (vlen)); \ + /* w_hi + w_lo is sqrt(y_hi + y_lo) to extra precision */ \ + /* now compute T = t_sc/(w_hi + w_lo) as fixed point */ \ + VFLOAT t_lo = VFMV_VF((t_sc), (vlen)); \ + VFLOAT t_hi = __riscv_vfdiv(t_lo, (w_hi), (vlen)); \ + (T) = __riscv_vfcvt_x(t_hi, (vlen)); \ + t_lo = __riscv_vfnmsac(t_lo, (w_hi), t_hi, (vlen)); \ + t_lo = __riscv_vfnmsac(t_lo, (w_lo), t_hi, (vlen)); \ + t_lo = __riscv_vfmul(t_lo, t_hi, (vlen)); \ + t_lo = __riscv_vfmul(t_lo, (t_sc_inv), vlen); \ + (T) = __riscv_vadd((T), __riscv_vfcvt_x(t_lo, (vlen)), (vlen)); \ + } while (0) +#define ERFCINV_PQ_TINY(T, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen) \ + do { \ + /* T is in scale of 65 */ \ + VINT P, Q; \ + P = PSTEP_I_SRA(P_tiny_7, T, 4, \ + PSTEP_I_SRA(P_tiny_8, P_tiny_9, 4, T, (vlen)), (vlen)); \ + /* P in Q_56 */ \ + P = PSTEP_I_SRA(P_tiny_5, T, 1, PSTEP_I_SRA(P_tiny_6, P, 2, T, (vlen)), \ + (vlen)); \ + /* P in Q_57 */ \ + P = PSTEP_I_SRA(P_tiny_3, T, 1, PSTEP_I(P_tiny_4, P, T, (vlen)), (vlen)); \ + /* P in Q_60 */ \ + P = PSTEP_I_SLL(P_tiny_1, T, 1, PSTEP_I_SRA(P_tiny_2, P, 1, T, (vlen)), \ + (vlen)); \ + /* P in Q_64 */ \ + P = PSTEP_I(P_tiny_0, T, P, (vlen)); \ + /* P in Q_66 */ \ + \ + Q = PSTEP_I_SRA(Q_tiny_7, T, 2, \ + PSTEP_I_SRA(Q_tiny_8, Q_tiny_9, 3, T, (vlen)), (vlen)); \ + /* Q in Q_56 */ \ + Q = PSTEP_I_SRA(Q_tiny_5, T, 1, PSTEP_I_SRA(Q_tiny_6, Q, 2, T, (vlen)), \ + (vlen)); \ + /* Q in Q_57 */ \ + Q = PSTEP_I_SRA(Q_tiny_3, T, 2, PSTEP_I(Q_tiny_4, Q, T, (vlen)), (vlen)); \ + /* P in Q_59 */ \ + Q = PSTEP_I_SLL(Q_tiny_1, T, 1, PSTEP_I(Q_tiny_2, Q, T, (vlen)), (vlen)); \ + /* Q in Q_64 */ \ + Q = PSTEP_I(Q_tiny_0, T, Q, (vlen)); \ + /* Q in Q_66 */ \ + \ + p_hi_tiny = __riscv_vfcvt_f(P, (vlen)); \ + p_lo_tiny = __riscv_vfcvt_f( \ + __riscv_vsub(P, __riscv_vfcvt_x(p_hi_tiny, (vlen)), (vlen)), (vlen)); \ + p_lo_tiny = __riscv_vfadd(p_lo_tiny, DELTA_P0_tiny, (vlen)); \ + q_hi_tiny = __riscv_vfcvt_f(Q, vlen); \ + q_lo_tiny = __riscv_vfcvt_f( \ + __riscv_vsub(Q, __riscv_vfcvt_x(q_hi_tiny, (vlen)), (vlen)), (vlen)); \ + q_lo_tiny = __riscv_vfadd(q_lo, DELTA_Q0_tiny, (vlen)); \ + } while (0) diff --git a/test/src/test_erfcinv.cpp b/test/src/test_erfcinv.cpp index dc7f416..e937fcf 100644 --- a/test/src/test_erfcinv.cpp +++ b/test/src/test_erfcinv.cpp @@ -23,22 +23,28 @@ TEST(erfcinv, tiny_args) { COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only") - x_start = 0x1.0p-1074;; + x_start = 0x1.0p-1074; + ; x_end = 0x1.0p-1000; nb_tests = 40000; double thres = 3.5; - report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests, thres); + report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, + nb_tests, thres); - x_start = 0x1.0p-1000;; + x_start = 0x1.0p-1000; + ; x_end = 0x1.0p-500; nb_tests = 40000; thres = 2.5; - report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests, thres); + report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, + nb_tests, thres); - x_start = 0x1.0p-500;; + x_start = 0x1.0p-500; + ; x_end = 0x1.0p-52; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests, thres); + report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, + nb_tests, thres); } TEST(erfcinv, small_args) { @@ -47,15 +53,19 @@ TEST(erfcinv, small_args) { COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only") - x_start = 0x1.0p-50;; + x_start = 0x1.0p-50; + ; x_end = 0x1.0p-20; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, + nb_tests); - x_start = 0x1.0p-20;; + x_start = 0x1.0p-20; + ; x_end = 0x1.0p-4; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, + nb_tests); } TEST(erfcinv, medium_args) { @@ -64,19 +74,23 @@ TEST(erfcinv, medium_args) { COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only") - x_start = 0x1.0p-4;; + x_start = 0x1.0p-4; + ; x_end = 0x1.0p-1; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, + nb_tests); - x_start = 0x1.0p-1;; + x_start = 0x1.0p-1; + ; x_end = 0x1.8p0; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, + nb_tests); x_start = 2.0 - 0x1.0p-52; x_end = 0x1.8p0; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, + nb_tests); } - diff --git a/test/src/test_erfinv.cpp b/test/src/test_erfinv.cpp index 1744e02..9e7a436 100644 --- a/test/src/test_erfinv.cpp +++ b/test/src/test_erfinv.cpp @@ -23,21 +23,26 @@ TEST(erfinv, small_args) { COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only") - x_start = 0x1.0p-40;; + x_start = 0x1.0p-40; + ; x_end = 0x1.0p-30; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); - x_start = 0x1.0p-30;; + x_start = 0x1.0p-30; + ; x_end = 0x1.0p-10; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); - x_start = 0x1.0p-10;; + x_start = 0x1.0p-10; + ; x_end = 0x1.0p-4; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); - + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); } TEST(erfinv, middle_args) { @@ -46,40 +51,52 @@ TEST(erfinv, middle_args) { COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only") - x_start = 0x1.0p-4;; + x_start = 0x1.0p-4; + ; x_end = 0x1.0p-2; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); - x_start = 0x1.0p-2;; + x_start = 0x1.0p-2; + ; x_end = 0x1.0p-1; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); - x_start = 0x1.0p-1;; + x_start = 0x1.0p-1; + ; x_end = 0x1.78p-1; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); - x_start = 0x1.6p-1;; + x_start = 0x1.6p-1; + ; x_end = 0x1.7cp-1; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); - x_start = 0x1.71p-1;; + x_start = 0x1.71p-1; + ; x_end = 0x1.8p-1; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); x_start = -0.8; x_end = 0.8; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); x_start = 0.25; x_end = 0.9; nb_tests = 2; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); } TEST(erfinv, close_to_1) { @@ -91,6 +108,6 @@ TEST(erfinv, close_to_1) { x_start = 1.0 - 0x1.0p-53; x_end = 0x1.ffp-1; nb_tests = 40000; - report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests); + report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, + nb_tests); } -