diff --git a/CMakeLists.txt b/CMakeLists.txt
index 233f8ef..846675a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,10 @@ set(PROJECT_SOURCES
   src/rvvlm_erfDI.c
   src/rvvlm_erfcD.c
   src/rvvlm_erfcDI.c
+  src/rvvlm_erfcinvD.c
+  src/rvvlm_erfcinvDI.c
+  src/rvvlm_erfinvD.c
+  src/rvvlm_erfinvDI.c
   src/rvvlm_expD_tbl.c
   src/rvvlm_expD.c
   src/rvvlm_expDI.c
diff --git a/include/rvvlm.h b/include/rvvlm.h
index 569af80..a5ac3a0 100644
--- a/include/rvvlm.h
+++ b/include/rvvlm.h
@@ -83,9 +83,25 @@ union sui64_fp64 {
 #define PSTEP_I(COEFF_j, X, POLY, vlen) \
   __riscv_vsadd(__riscv_vsmul((POLY), (X), 1, (vlen)), (COEFF_j), (vlen))

+#define PSTEP_I_SLL(COEFF_j, X, K, POLY, vlen) \
+  __riscv_vsadd( \
+      __riscv_vsll(__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
+      (COEFF_j), (vlen))
+
+#define PSTEP_I_SRA(COEFF_j, X, K, POLY, vlen) \
+  __riscv_vsadd( \
+      __riscv_vsra(__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
+      (COEFF_j), (vlen))
+
 #define PSTEPN_I(COEFF_j, X, POLY, vlen) \
   __riscv_vrsub(__riscv_vsmul((POLY), (X), 1, (vlen)), (COEFF_j), (vlen))

+#define PSTEP_I_ab(pick_a, COEFF_a, COEFF_b, X, POLY, vlen) \
+  __riscv_vsadd( \
+      __riscv_vsmul((POLY), (X), 1, (vlen)), \
+      __riscv_vmerge(VMVI_VX((COEFF_b), (vlen)), (COEFF_a), (pick_a), (vlen)), \
+      (vlen))
+
 #define FAST2SUM(X, Y, S, s, vlen) \
   do { \
     (S) = __riscv_vfadd((X), (Y), (vlen)); \
@@ -285,6 +301,20 @@ union sui64_fp64 {
 #define RVVLM_ERFCDI_VSET_CONFIG "rvvlm_fp64m1.h"
 #define RVVLM_ERFCDI_STD rvvlm_erfcI

+// FP64 erfinv function configuration
+#define RVVLM_ERFINVD_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_ERFINVD_STD rvvlm_erfinv
+
+#define RVVLM_ERFINVDI_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_ERFINVDI_STD rvvlm_erfinvI
+
+// FP64 erfcinv function configuration
+#define RVVLM_ERFCINVD_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_ERFCINVD_STD rvvlm_erfcinv
+
+#define RVVLM_ERFCINVDI_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_ERFCINVDI_STD rvvlm_erfcinvI
+
 // FP64 exp function configuration
 #define RVVLM_EXPD_VSET_CONFIG "rvvlm_fp64m4.h"
 #define RVVLM_EXPD_STD rvvlm_expD_std
@@ -518,6 +548,14 @@ void RVVLM_ERFCD_STD(size_t x_len, const double *x, double *y);
 void RVVLM_ERFCDI_STD(size_t x_len, const double *x, size_t stride_x,
                       double *y, size_t stride_y);

+void RVVLM_ERFCINVD_STD(size_t x_len, const double *x, double *y);
+void RVVLM_ERFCINVDI_STD(size_t x_len, const double *x, size_t stride_x,
+                         double *y, size_t stride_y);
+
+void RVVLM_ERFINVD_STD(size_t x_len, const double *x, double *y);
+void RVVLM_ERFINVDI_STD(size_t x_len, const double *x, size_t stride_x,
+                        double *y, size_t stride_y);
+
 void RVVLM_EXPD_STD(size_t x_len, const double *x, double *y);
 void RVVLM_EXPDI_STD(size_t x_len, const double *x, size_t stride_x, double *y,
                      size_t stride_y);
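Note on the three macros added above: `__riscv_vsmul(..., 1, vlen)` is RVV's signed saturating fractional multiply, i.e. a Q63-by-Q63 product keeping the rounded high 64 bits, so each `PSTEP_I*` is one fixed-point Horner step. The `_SLL`/`_SRA` variants shift that product before adding the next coefficient, so coefficients stored in different Q-scales can be combined, and `PSTEP_I_ab` picks one of two coefficient tables per lane under a mask. A minimal scalar sketch of the `_SRA` step (hypothetical helper names; saturation of the `INT64_MIN * INT64_MIN` corner case omitted):

```c
#include <stdint.h>

// Scalar model of __riscv_vsmul with round-to-nearest-up vxrm:
// keep the high 64 bits of the exact 128-bit product, i.e. a Q63 multiply.
static int64_t q63_mul(int64_t a, int64_t b) {
  __int128 prod = (__int128)a * (__int128)b;             // exact product
  return (int64_t)((prod + ((__int128)1 << 62)) >> 63);  // rounded high part
}

// PSTEP_I:     poly' = coeff_j + q63_mul(poly, x)
// PSTEP_I_SRA: poly' = coeff_j + (q63_mul(poly, x) >> k); the shift aligns
// the running sum to a coefficient kept in a coarser Q-scale.
static int64_t pstep_i_sra(int64_t coeff_j, int64_t x, int k, int64_t poly) {
  return coeff_j + (q63_mul(poly, x) >> k);
}
```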
diff --git a/include/rvvlm_erfcinvD.inc.h b/include/rvvlm_erfcinvD.inc.h
new file mode 100644
index 0000000..15816ee
--- /dev/null
+++ b/include/rvvlm_erfcinvD.inc.h
@@ -0,0 +1,211 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_ERFCINVD_STD
#else
#define F_VER1 RVVLM_ERFCINVDI_STD
#endif

// Erfcinv is defined on (0, 2); it suffices to consider (0, 1].
// Two regions of approximation: left is (0, 0x1.2p-2] and right is (0x1.2p-2, 1).
// Both are done with rational functions.
// For right, t*P(t)/Q(t), t = 1-x; x in (0x1.2p-2, 1)
// For left, y*P(t)/Q(t), y = sqrt(-log(x)); and t = 1/y
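The tables that follow are 64-bit fixed-point coefficients: a Q79 entry c represents c * 2^-79, and the `DELTA_*0` values carry the part of the leading coefficient that does not fit in 64 bits (the code folds them into `p_lo`/`q_lo` after converting the fixed-point sum to floating point, still in the integer-scaled domain; the common 2^-79 or 2^-67 factor cancels in the ratio P/Q). A small decoding check, assuming that reading of the delta's scale:

```c
#include <stdio.h>

// Decode P_right_0: real value ~ (P_right_0 + DELTA_P0_right) * 2^-79.
int main(void) {
  long long p0 = -0x48dbe9f5b3eabaaLL;  // Q79 leading coefficient
  double delta = -0x1.ec7dc41c17860p-2; // residual, same Q79 domain (assumed)
  printf("P_right_0 ~ %.20e\n", ((double)p0 + delta) * 0x1.0p-79);
  return 0;
}
```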
// P_coefficients in ascending order, all in Q79. p0_delta is in floating point
#define P_right_0 -0x48dbe9f5b3eabaa
#define P_right_1 -0xb35279f1a626ae5
#define P_right_2 0x33789911873d184a
#define P_right_3 -0x1bf9138fc77c0fbf
#define P_right_4 -0x2d4ec43bc48403d4
#define P_right_5 0x2d61deb53842cca1
#define P_right_6 0x23324eca6b3ff02
#define P_right_7 -0xd4ec1d31542c4fc
#define P_right_8 0x2ecf3c60308b0f2
#define P_right_9 0x7c917b3378071e
#define P_right_10 -0x1e09b597f226ca
#define DELTA_P0_right -0x1.ec7dc41c17860p-2

// Q_coefficients in ascending order, all in Q79. q0_delta is in floating point
#define Q_right_0 -0x52366e5b14c0970
#define Q_right_1 -0xca57e95abcc599b
#define Q_right_2 0x3b6c91ec67f5759c
#define Q_right_3 -0x1c40d5daa3be22bc
#define Q_right_4 -0x41f11eb5d837386c
#define Q_right_5 0x3c6ce478fcd75c9a
#define Q_right_6 0xbb1cd7270cfba1d
#define Q_right_7 -0x1988a4116498f1af
#define Q_right_8 0x44dc3042f103d20
#define Q_right_9 0x2390e683d02edf3
#define Q_right_10 -0x8ec66f2a7e410c
#define DELTA_Q0_right -0x1.29a0161e99446p-3

// P_coefficients in ascending order, all in Q67. p0_delta is in floating point
#define P_left_0 0x17a0bb69321df
#define P_left_1 0x402eb416ae6015
#define P_left_2 0x2973eb18028ce34
#define P_left_3 0x8034a7ece1d5370
#define P_left_4 0xa76c08a74dae273
#define P_left_5 0x11dd3876b83dd078
#define P_left_6 0xfdd7693c3b77653
#define P_left_7 0xb33d66152b3c223
#define P_left_8 0x5a564c28c6a41a9
#define P_left_9 0x1190449fe630213
#define P_left_10 -0x659c784274e1
#define DELTA_P0_left -0x1.d622f4cbe0eeep-2

// Q_coefficients in ascending order, all in Q67. q0_delta is in floating point
#define Q_left_0 0x17a09aabf9cee
#define Q_left_1 0x4030b9059ffcad
#define Q_left_2 0x29b26b0d87f7855
#define Q_left_3 0x87572a13d3fa2dd
#define Q_left_4 0xd7a728b5620ac3c
#define Q_left_5 0x1754392b473fd439
#define Q_left_6 0x1791b9a091a816c2
#define Q_left_7 0x167f71db9e13b075
#define Q_left_8 0xcb9f5f3e5e618a4
#define Q_left_9 0x68271fae767c68e
#define Q_left_10 0x13745c4fa224b25
#define DELTA_Q0_left 0x1.f7e7557a34ae6p-2

void F_VER1(API) {
  size_t vlen;
  VFLOAT vx, vx_sign, vy, vy_special;
  VBOOL special_args;

  SET_ROUNDTONEAREST;
  // stripmining over input arguments
  for (; _inarg_n > 0; _inarg_n -= vlen) {
    vlen = VSET(_inarg_n);
    vx = VFLOAD_INARG1(vlen);

    // Handle Inf and NaN
    EXCEPTION_HANDLING_ERFCINV(vx, special_args, vy_special, vlen);

    vx_sign = __riscv_vfrsub(vx, fp_posOne, vlen);
    VFLOAT two_minus_x = __riscv_vfadd(vx_sign, fp_posOne, vlen);
    VBOOL x_gt_1 = __riscv_vmflt(vx_sign, fp_posZero, vlen);
    vx = __riscv_vmerge(vx, two_minus_x, x_gt_1, vlen);
    // vx is now in (0, 1]
    VBOOL x_in_left = __riscv_vmfle(vx, 0x1.2p-2, vlen);

    VFLOAT w_hi, w_lo, w_hi_left, w_lo_left, y_hi, y_lo;
    VINT T, T_left, T_tiny;
    VBOOL x_is_tiny;
    // clear x_is_tiny: it is only assigned inside the left branch below,
    // but is tested unconditionally afterwards
    x_is_tiny = __riscv_vmxor(x_is_tiny, x_is_tiny, vlen);

    if (__riscv_vcpop(x_in_left, vlen) > 0) {
      VFLOAT x_left = VFMV_VF(0x1.0p-3, vlen);
      x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen);
      x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-52, vlen);
      INT n_adjust = 60;
      x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen);
      NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen);

      SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left,
                        0x1.0p63, 0x1.0p-63, vlen);
      if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
        VFLOAT w_hi_dummy, w_lo_dummy;
        SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny,
                          0x1.0p65, 0x1.0p-65, vlen);
      }
    }
    w_hi = VFMV_VF(fp_posOne, vlen);
    w_hi = __riscv_vfsub(w_hi, vx, vlen);
    w_lo = __riscv_vfrsub(w_hi, fp_posOne, vlen);
    w_lo = __riscv_vfsub(w_lo, vx, vlen);
    T = __riscv_vfcvt_x(__riscv_vfmul(w_hi, 0x1.0p63, vlen), vlen);
    VFLOAT delta_t = __riscv_vfmul(w_lo, 0x1.0p63, vlen);
    T = __riscv_vadd(T, __riscv_vfcvt_x(delta_t, vlen), vlen);
    T = __riscv_vmerge(T, T_left, x_in_left, vlen);

    w_hi = __riscv_vmerge(w_hi, w_hi_left, x_in_left, vlen);
    w_lo = __riscv_vmerge(w_lo, w_lo_left, x_in_left, vlen);

    // For the transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
    VINT P, Q;

    P = __riscv_vmerge(VMVI_VX(P_right_10, vlen), P_left_10, x_in_left, vlen);
    P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T,
        PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T,
        PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T,
        PSTEP_I_ab(x_in_left, P_left_9, P_right_9, T, P,
        vlen), vlen), vlen), vlen);

    Q = __riscv_vmerge(VMVI_VX(Q_right_10, vlen), Q_left_10, x_in_left, vlen);
    Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T,
        PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T,
        PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T,
        PSTEP_I_ab(x_in_left, Q_left_9, Q_right_9, T, Q,
        vlen), vlen), vlen), vlen);

    P = PSTEP_I_ab(x_in_left, P_left_0, P_right_0, T,
        PSTEP_I_ab(x_in_left, P_left_1, P_right_1, T,
        PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T,
        PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T,
        PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T,
        PSTEP_I_ab(x_in_left, P_left_5, P_right_5, T, P,
        vlen), vlen),
        vlen), vlen), vlen), vlen);

    Q = PSTEP_I_ab(x_in_left, Q_left_0, Q_right_0, T,
        PSTEP_I_ab(x_in_left, Q_left_1, Q_right_1, T,
        PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T,
        PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T,
        PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T,
        PSTEP_I_ab(x_in_left, Q_left_5, Q_right_5, T, Q,
        vlen), vlen), vlen), vlen), vlen), vlen);

    VFLOAT p_hi, p_lo;
    p_hi = __riscv_vfcvt_f(P, vlen);
    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen);
    VFLOAT delta_p0 = VFMV_VF(DELTA_P0_right, vlen);
    delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_left, x_in_left, vlen);
    p_lo = __riscv_vfadd(p_lo, delta_p0, vlen);

    VFLOAT q_hi, q_lo;
    q_hi = __riscv_vfcvt_f(Q, vlen);
    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen);
    VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_right, vlen);
    delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_left, x_in_left, vlen);
    q_lo = __riscv_vfadd(q_lo, delta_q0, vlen);

    if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
      VFLOAT p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny;
      ERFCINV_PQ_TINY(T_tiny, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen);
      p_hi = __riscv_vmerge(p_hi, p_hi_tiny, x_is_tiny, vlen);
      p_lo = __riscv_vmerge(p_lo, p_lo_tiny, x_is_tiny, vlen);
      q_hi = __riscv_vmerge(q_hi, q_hi_tiny, x_is_tiny, vlen);
      q_lo = __riscv_vmerge(q_lo, q_lo_tiny, x_is_tiny, vlen);
    }

    // (y_hi, y_lo) <-- (w_hi + w_lo) * (p_hi + p_lo)
    y_hi = __riscv_vfmul(w_hi, p_hi, vlen);
    y_lo = __riscv_vfmsub(w_hi, p_hi, y_hi, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_hi, p_lo, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen);

    DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen);

    vy = w_hi;
    vy = __riscv_vfsgnj(vy, vx_sign, vlen);
    vy = __riscv_vmerge(vy, vy_special, special_args, vlen);

    // copy vy into y and increment addr pointers
    VFSTORE_OUTARG1(vy, vlen);

    INCREMENT_INARG1(vlen);
    INCREMENT_OUTARG1(vlen);
  }
  RESTORE_FRM;
}
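The `(y_hi, y_lo) <-- (w_hi + w_lo) * (p_hi + p_lo)` step near the end of the routine above is a standard FMA-based two-term product; a scalar model, with `fma()` standing in for `vfmsub`/`vfmacc`:

```c
#include <math.h>

// Given w = w_hi + w_lo and p = p_hi + p_lo with |w_lo| << ulp(w_hi) and
// |p_lo| << ulp(p_hi), produce y_hi + y_lo ~ w * p. The w_lo * p_lo term
// is below the accuracy target and is dropped, exactly as in the vector code.
static void mul2(double w_hi, double w_lo, double p_hi, double p_lo,
                 double *y_hi, double *y_lo) {
  *y_hi = w_hi * p_hi;
  double err = fma(w_hi, p_hi, -*y_hi); // exact low part of w_hi * p_hi
  err += w_hi * p_lo;                   // cross terms
  err += w_lo * p_hi;
  *y_lo = err;
}
```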
diff --git a/include/rvvlm_erfinvD.inc.h b/include/rvvlm_erfinvD.inc.h
new file mode 100644
index 0000000..f7d78b4
--- /dev/null
+++ b/include/rvvlm_erfinvD.inc.h
@@ -0,0 +1,180 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_ERFINVD_STD
#else
#define F_VER1 RVVLM_ERFINVDI_STD
#endif

// Two regions of approximation: left is [0, 0x1.7p-1) and right is [0x1.7p-1, 1).
// Both are done with rational functions.
// For left, x*P(x)/Q(x), x in [0, 0x1.7p-1)
// For right, y*P(t)/Q(t), y = sqrt(-log(1-x)); and t = 1/y

// P_coefficients in ascending order, all in Q79. p0_delta is in floating point
#define P_left_0 -0x48dbe9f5b3eabaa
#define P_left_1 -0xb35279f1a626ae5
#define P_left_2 0x33789911873d184a
#define P_left_3 -0x1bf9138fc77c0fbf
#define P_left_4 -0x2d4ec43bc48403d4
#define P_left_5 0x2d61deb53842cca1
#define P_left_6 0x23324eca6b3ff02
#define P_left_7 -0xd4ec1d31542c4fc
#define P_left_8 0x2ecf3c60308b0f2
#define P_left_9 0x7c917b3378071e
#define P_left_10 -0x1e09b597f226ca
#define DELTA_P0_left -0x1.ec7dc41c17860p-2

// Q_coefficients in ascending order, all in Q79. q0_delta is in floating point
#define Q_left_0 -0x52366e5b14c0970
#define Q_left_1 -0xca57e95abcc599b
#define Q_left_2 0x3b6c91ec67f5759c
#define Q_left_3 -0x1c40d5daa3be22bc
#define Q_left_4 -0x41f11eb5d837386c
#define Q_left_5 0x3c6ce478fcd75c9a
#define Q_left_6 0xbb1cd7270cfba1d
#define Q_left_7 -0x1988a4116498f1af
#define Q_left_8 0x44dc3042f103d20
#define Q_left_9 0x2390e683d02edf3
#define Q_left_10 -0x8ec66f2a7e410c
#define DELTA_Q0_left -0x1.29a0161e99446p-3

// P_coefficients in ascending order, all in Q67. p0_delta is in floating point
#define P_right_0 0x17a0bb69321df
#define P_right_1 0x402eb416ae6015
#define P_right_2 0x2973eb18028ce34
#define P_right_3 0x8034a7ece1d5370
#define P_right_4 0xa76c08a74dae273
#define P_right_5 0x11dd3876b83dd078
#define P_right_6 0xfdd7693c3b77653
#define P_right_7 0xb33d66152b3c223
#define P_right_8 0x5a564c28c6a41a9
#define P_right_9 0x1190449fe630213
#define P_right_10 -0x659c784274e1
#define DELTA_P0_right -0x1.d622f4cbe0eeep-2

// Q_coefficients in ascending order, all in Q67. q0_delta is in floating point
#define Q_right_0 0x17a09aabf9cee
#define Q_right_1 0x4030b9059ffcad
#define Q_right_2 0x29b26b0d87f7855
#define Q_right_3 0x87572a13d3fa2dd
#define Q_right_4 0xd7a728b5620ac3c
#define Q_right_5 0x1754392b473fd439
#define Q_right_6 0x1791b9a091a816c2
#define Q_right_7 0x167f71db9e13b075
#define Q_right_8 0xcb9f5f3e5e618a4
#define Q_right_9 0x68271fae767c68e
#define Q_right_10 0x13745c4fa224b25
#define DELTA_Q0_right 0x1.f7e7557a34ae6p-2

void F_VER1(API) {
  size_t vlen;
  VFLOAT vx, vx_orig, vy, vy_special;
  VBOOL special_args;

  SET_ROUNDTONEAREST;
  // stripmining over input arguments
  for (; _inarg_n > 0; _inarg_n -= vlen) {
    vlen = VSET(_inarg_n);
    vx = VFLOAD_INARG1(vlen);
    vx_orig = vx;

    // Handle Inf and NaN
    EXCEPTION_HANDLING_ERFINV(vx, special_args, vy_special, vlen);

    vx = __riscv_vfsgnj(vx, fp_posOne, vlen);
    VBOOL x_in_right = __riscv_vmfge(vx, 0x1.7p-1, vlen);

    VFLOAT w_hi, w_lo, w_hi_right, w_lo_right, y_hi, y_lo;
    VINT T, T_right;

    if (__riscv_vcpop(x_in_right, vlen) > 0) {
      VFLOAT one_minus_x;
      one_minus_x = __riscv_vfrsub(vx, fp_posOne, vlen);

      VINT n_adjust;
      n_adjust = __riscv_vxor(n_adjust, n_adjust, vlen);

      NEG_LOGX_4_TRANSFORM(one_minus_x, n_adjust, y_hi, y_lo, vlen);

      SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_right, w_lo_right, T_right,
                        0x1.0p63, 0x1.0p-63, vlen);
    }
    T = __riscv_vfcvt_x(__riscv_vfmul(vx, 0x1.0p63, vlen), vlen);
    T = __riscv_vmerge(T, T_right, x_in_right, vlen);

    w_hi = vx;
    w_lo = I_AS_F(__riscv_vxor(F_AS_I(w_lo), F_AS_I(w_lo), vlen));
    w_hi = __riscv_vmerge(w_hi, w_hi_right, x_in_right, vlen);
    w_lo = __riscv_vmerge(w_lo, w_lo_right, x_in_right, vlen);

    // For the transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
    VINT P, Q;

    P = __riscv_vmerge(VMVI_VX(P_left_10, vlen), P_right_10, x_in_right, vlen);
    P = PSTEP_I_ab(x_in_right, P_right_6, P_left_6, T,
        PSTEP_I_ab(x_in_right, P_right_7, P_left_7, T,
        PSTEP_I_ab(x_in_right, P_right_8, P_left_8, T,
        PSTEP_I_ab(x_in_right, P_right_9, P_left_9, T, P,
        vlen), vlen), vlen), vlen);

    Q = __riscv_vmerge(VMVI_VX(Q_left_10, vlen), Q_right_10, x_in_right, vlen);
    Q = PSTEP_I_ab(x_in_right, Q_right_6, Q_left_6, T,
        PSTEP_I_ab(x_in_right, Q_right_7, Q_left_7, T,
        PSTEP_I_ab(x_in_right, Q_right_8, Q_left_8, T,
        PSTEP_I_ab(x_in_right, Q_right_9, Q_left_9, T, Q,
        vlen), vlen), vlen), vlen);

    P = PSTEP_I_ab(x_in_right, P_right_0, P_left_0, T,
        PSTEP_I_ab(x_in_right,
        P_right_1, P_left_1, T,
        PSTEP_I_ab(x_in_right, P_right_2, P_left_2, T,
        PSTEP_I_ab(x_in_right, P_right_3, P_left_3, T,
        PSTEP_I_ab(x_in_right, P_right_4, P_left_4, T,
        PSTEP_I_ab(x_in_right, P_right_5, P_left_5, T, P,
        vlen), vlen), vlen), vlen), vlen), vlen);

    Q = PSTEP_I_ab(x_in_right, Q_right_0, Q_left_0, T,
        PSTEP_I_ab(x_in_right, Q_right_1, Q_left_1, T,
        PSTEP_I_ab(x_in_right, Q_right_2, Q_left_2, T,
        PSTEP_I_ab(x_in_right, Q_right_3, Q_left_3, T,
        PSTEP_I_ab(x_in_right, Q_right_4, Q_left_4, T,
        PSTEP_I_ab(x_in_right, Q_right_5, Q_left_5, T, Q,
        vlen), vlen), vlen), vlen), vlen), vlen);

    VFLOAT p_hi, p_lo;
    p_hi = __riscv_vfcvt_f(P, vlen);
    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen);
    VFLOAT delta_p0 = VFMV_VF(DELTA_P0_left, vlen);
    delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_right, x_in_right, vlen);
    p_lo = __riscv_vfadd(p_lo, delta_p0, vlen);

    VFLOAT q_hi, q_lo;
    q_hi = __riscv_vfcvt_f(Q, vlen);
    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen);
    VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_left, vlen);
    delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_right, x_in_right, vlen);
    q_lo = __riscv_vfadd(q_lo, delta_q0, vlen);

    // (y_hi, y_lo) <-- (w_hi + w_lo) * (p_hi + p_lo)
    y_hi = __riscv_vfmul(w_hi, p_hi, vlen);
    y_lo = __riscv_vfmsub(w_hi, p_hi, y_hi, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_hi, p_lo, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen);

    DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen);

    vy = w_hi;

    vy = __riscv_vfsgnj(vy, vx_orig, vlen);
    vy = __riscv_vmerge(vy, vy_special, special_args, vlen);
    // copy vy into y and increment addr pointers
    VFSTORE_OUTARG1(vy, vlen);

    INCREMENT_INARG1(vlen);
    INCREMENT_OUTARG1(vlen);
  }
  RESTORE_FRM;
}
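The shared helper header below opens with two-term splits of the constants it needs: `RTPI_BY2_HI/LO` together give sqrt(pi)/2, the slope of erfinv at 0, to well beyond double precision, and `NEG_LOG2_HI/LO` give -log(2), with `_HI` chosen with trailing zero bits so that products like `n * NEG_LOG2_HI` stay exact. A standalone sanity check of the first split:

```c
#include <math.h>
#include <stdio.h>

// Not library code: verify RTPI_BY2_HI + RTPI_BY2_LO ~ sqrt(pi)/2 in long
// double; the residual should be far below one ulp of a double.
int main(void) {
  long double ref = sqrtl(3.141592653589793238462643383279503L) * 0.5L;
  double hi = 0x1.c5bf891b4ef6ap-1, lo = 0x1.4f38760a41abbp-54;
  printf("residual: %.3Le\n", ref - ((long double)hi + (long double)lo));
  return 0;
}
```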
diff --git a/include/rvvlm_inverrorfuncsD.h b/include/rvvlm_inverrorfuncsD.h
new file mode 100644
index 0000000..1b87630
--- /dev/null
+++ b/include/rvvlm_inverrorfuncsD.h
@@ -0,0 +1,215 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#define RTPI_BY2_HI 0x1.c5bf891b4ef6ap-1
#define RTPI_BY2_LO 0x1.4f38760a41abbp-54
#define NEG_LOG2_HI -0x1.62e42fefa4000p-1
#define NEG_LOG2_LO 0x1.8432a1b0e2634p-43

// P_coefficients in ascending order, in varying scales. p0_delta is in floating point
#define P_tiny_0 -0x8593442e139d // scale 66
#define P_tiny_1 -0x1fcf7055ac5f03 // scale 64
#define P_tiny_2 -0x106dde33d8dc179 // scale 61
#define P_tiny_3 -0xc31d6e09935118a // scale 60
#define P_tiny_4 -0x3560de73cb5bbcc0 // scale 59
#define P_tiny_5 -0x1eb4c7e14b254de8 // scale 57
#define P_tiny_6 0x1fdf5a9d23430bd7 // scale 56
#define P_tiny_7 0x62c4020631de121b // scale 56
#define P_tiny_8 0x4be1ed5d031773f1 // scale 58
#define P_tiny_9 0x55a9f8b9538981a1 // scale 60
#define DELTA_P0_tiny 0x1.ba4d0b79d16e6p-2 // scale 66

// Q_coefficients in ascending order, in varying scales. q0_delta is in floating point
#define Q_tiny_0 -0x85933cda2d6d // scale 66
#define Q_tiny_1 -0x1fcf792da7d51d // scale 64
#define Q_tiny_2 -0x106ec6e0ed13ae1 // scale 61
#define Q_tiny_3 -0x61b925a39a461aa // scale 59
#define Q_tiny_4 -0x35ebf9dc72fab062 // scale 59
#define Q_tiny_5 -0x2131cf7760e82873 // scale 57
#define Q_tiny_6 0x1860ae67db2a6609 // scale 56
#define Q_tiny_7 0x5e9a123701d89289 // scale 56
#define Q_tiny_8 0x417b35aab14ac49d // scale 56
#define Q_tiny_9 0x5e4a26a7c1415755 // scale 57
#define DELTA_Q0_tiny 0x1.8a7adad44d65ap-4 // scale 66

// erfinv(+-1) = +-Inf with divide by zero
// erfinv(x), |x| > 1, real is NaN with invalid
// erfinv(NaN) is NaN, invalid if input is signalling NaN
// erfinv(x) is (rt(pi)/2) x for |x| < 2^-30
#define EXCEPTION_HANDLING_ERFINV(vx, special_args, vy_special, vlen) \
  do { \
    VUINT expo_x = __riscv_vand(__riscv_vsrl(F_AS_U((vx)), MAN_LEN, (vlen)), \
                                0x7FF, (vlen)); \
    VBOOL x_large = __riscv_vmsgeu(expo_x, EXP_BIAS, (vlen)); \
    VBOOL x_small = __riscv_vmsltu(expo_x, EXP_BIAS - 30, (vlen)); \
    (special_args) = __riscv_vmor(x_large, x_small, (vlen)); \
    if (__riscv_vcpop((special_args), (vlen)) > 0) { \
      VFLOAT abs_x = __riscv_vfsgnj((vx), fp_posOne, (vlen)); \
      VBOOL x_gt_1 = __riscv_vmfgt(abs_x, fp_posOne, (vlen)); \
      VBOOL x_eq_1 = __riscv_vmfeq(abs_x, fp_posOne, (vlen)); \
      /* substitute |x| > 1 with sNaN */ \
      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_gt_1, (vlen)); \
      /* substitute |x| = 1 with +/-Inf and generate div-by-zero signal */ \
      VFLOAT tmp = VFMV_VF(fp_posZero, (vlen)); \
      tmp = __riscv_vfsgnj(tmp, (vx), (vlen)); \
      tmp = __riscv_vfrec7(x_eq_1, tmp, (vlen)); \
      (vy_special) = __riscv_vfadd((special_args), (vx), (vx), (vlen)); \
      (vy_special) = __riscv_vmerge((vy_special), tmp, x_eq_1, (vlen)); \
      tmp = __riscv_vfmul(x_small, (vx), RTPI_BY2_LO, (vlen)); \
      tmp = __riscv_vfmacc(x_small, tmp, RTPI_BY2_HI, (vx), (vlen)); \
      (vy_special) = __riscv_vmerge((vy_special), tmp, x_small, (vlen)); \
      (vx) = __riscv_vfmerge((vx), fp_posZero, (special_args), (vlen)); \
    } \
  } while (0)
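A scalar model of the special-case policy that `EXCEPTION_HANDLING_ERFINV` encodes (illustrative only; the vector code routes lanes by biased exponent, substitutes sNaN/Inf inputs, and patches `vx` so the main path runs harmlessly on special lanes):

```c
#include <math.h>

static double erfinv_special(double x) {
  double ax = fabs(x);
  if (isnan(x)) return x + x;             // quiet the NaN; invalid if sNaN
  if (ax > 1.0) return (x - x) / (x - x); // 0/0 -> NaN, raises invalid
  if (ax == 1.0) return copysign(1.0, x) / 0.0; // +-Inf, raises div-by-zero
  // |x| < 2^-30: erfinv(x) ~ (sqrt(pi)/2) * x, evaluated as a hi/lo product
  return 0x1.c5bf891b4ef6ap-1 * x + 0x1.4f38760a41abbp-54 * x;
}
```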
// erfcinv(0) = Inf, erfcinv(2) = -Inf with divide by zero
// erfcinv(x), x outside [0, 2], real is NaN with invalid
// erfcinv(NaN) is NaN, invalid if input is signalling NaN
#define EXCEPTION_HANDLING_ERFCINV(vx, special_args, vy_special, vlen) \
  do { \
    VUINT vclass = __riscv_vfclass((vx), (vlen)); \
    IDENTIFY(vclass, 0x39F, (special_args), (vlen)); \
    VBOOL x_ge_2 = __riscv_vmfge((vx), 0x1.0p1, (vlen)); \
    (special_args) = __riscv_vmor((special_args), x_ge_2, (vlen)); \
    if (__riscv_vcpop((special_args), (vlen)) > 0) { \
      VBOOL x_gt_2 = __riscv_vmfgt((vx), 0x1.0p1, (vlen)); \
      VBOOL x_lt_0 = __riscv_vmflt((vx), fp_posZero, (vlen)); \
      /* substitute x > 2 or x < 0 with sNaN */ \
      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_gt_2, (vlen)); \
      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_lt_0, (vlen)); \
      /* substitute x = 0 or 2 with +/-Inf and generate div-by-zero signal */ \
      VFLOAT tmp = VFMV_VF(fp_posZero, (vlen)); \
      VFLOAT x_tmp = __riscv_vfrsub((vx), fp_posOne, (vlen)); \
      tmp = __riscv_vfsgnj(tmp, x_tmp, (vlen)); \
      VBOOL x_eq_2 = __riscv_vmfeq((vx), 0x1.0p1, (vlen)); \
      VBOOL x_eq_0 = __riscv_vmfeq((vx), fp_posZero, (vlen)); \
      VBOOL pm_Inf = __riscv_vmor(x_eq_2, x_eq_0, (vlen)); \
      tmp = __riscv_vfrec7(pm_Inf, tmp, (vlen)); \
      (vy_special) = __riscv_vfsub((special_args), (vx), (vx), (vlen)); \
      (vy_special) = __riscv_vmerge((vy_special), tmp, pm_Inf, (vlen)); \
      (vx) = __riscv_vfmerge((vx), fp_posOne, (special_args), (vlen)); \
    } \
  } while (0)

// Compute -log(2^(-n_adjust) * x), where x < 1
#define NEG_LOGX_4_TRANSFORM(vx, n_adjust, y_hi, y_lo, vlen) \
  do { \
    /* work on entire vector register */ \
    VFLOAT vx_in = (vx); \
    VINT n = __riscv_vadd(__riscv_vsra(F_AS_I(vx_in), MAN_LEN - 8, (vlen)), \
                          0x96, (vlen)); \
    n = __riscv_vsub(__riscv_vsra(n, 8, (vlen)), EXP_BIAS, (vlen)); \
    VFLOAT scale = I_AS_F(__riscv_vsll(__riscv_vrsub(n, EXP_BIAS, (vlen)), \
                                       MAN_LEN, (vlen))); \
    vx_in = __riscv_vfmul(vx_in, scale, (vlen)); \
    /* x is scaled, and -log(x) is 2 atanh(w/2); w = 2(1-x)/(1+x) */ \
    n = __riscv_vsub(n, (n_adjust), (vlen)); \
    VFLOAT n_flt = __riscv_vfcvt_f(n, (vlen)); \
    VFLOAT numer = __riscv_vfrsub(vx_in, fp_posOne, (vlen)); \
    /* note that 1-x is exact as 1/2 < x < 2 */ \
    numer = __riscv_vfadd(numer, numer, (vlen)); \
    VFLOAT denom = __riscv_vfadd(vx_in, fp_posOne, (vlen)); \
    VFLOAT delta_denom = __riscv_vfadd( \
        __riscv_vfrsub(denom, fp_posOne, (vlen)), vx_in, (vlen)); \
    /* note that 1 - denom is exact even if denom > 2 */ \
    /* because 1 has many trailing zeros */ \
    VFLOAT r_hi, r_lo, r; \
    DIV_N1D2(numer, denom, delta_denom, r_hi, r_lo, (vlen)); \
    r = __riscv_vfadd(r_hi, r_lo, (vlen)); \
    /* for the original unscaled x, we have */ \
    /* -log(x) = -n * log(2) + 2 atanh(-w/2) */ \
    /* where w = 2(1-x)/(1+x); -w = 2(x-1)/(x+1) */ \
    VFLOAT A, B; \
    A = __riscv_vfmadd(n_flt, NEG_LOG2_HI, r_hi, (vlen)); \
    B = __riscv_vfmsub(n_flt, NEG_LOG2_HI, A, (vlen)); \
    B = __riscv_vfadd(B, r_hi, (vlen)); \
    B = __riscv_vfadd(r_lo, B, (vlen)); \
    VFLOAT rsq = __riscv_vfmul(r, r, (vlen)); \
    VFLOAT rcube = __riscv_vfmul(rsq, r, (vlen)); \
    VFLOAT r6 = __riscv_vfmul(rcube, rcube, (vlen)); \
    VFLOAT poly_right = PSTEP(0x1.74681ff881228p-14, rsq, \
                              PSTEP(0x1.39751be23e4a3p-16, \
                                    0x1.30a893993e73dp-18, rsq, vlen), \
                              vlen); \
    VFLOAT poly_left = PSTEP(0x1.999999996ce82p-7, rsq, \
                             PSTEP(0x1.249249501b1adp-9, \
                                   0x1.c71c47e7189f6p-12, rsq, vlen), \
                             vlen); \
    poly_left = __riscv_vfmacc(poly_left, r6, poly_right, (vlen)); \
    poly_left = PSTEP(0x1.55555555555dbp-4, rsq, poly_left, (vlen)); \
    B = __riscv_vfmacc(B, NEG_LOG2_LO, n_flt, (vlen)); \
    B = __riscv_vfmacc(B, rcube, poly_left, (vlen)); \
    FAST2SUM(A, B, (y_hi), (y_lo), (vlen)); \
    /* A + B is -log(x) with extra precision, |B| <= ulp(A)/2 */ \
  } while (0)
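`NEG_LOGX_4_TRANSFORM` above rests on the identity -log(x) = -n*log(2) + 2*atanh(r/2), with x' = x*2^-n near 1 and r = 2(1-x')/(1+x'); the odd series 2*atanh(r/2) = r + r^3/12 + r^5/80 + r^7/448 + ... matches the `poly_left`/`poly_right` coefficients. A crude scalar model (the library rounds n so the reduced argument sits closer to 1, and carries r and n*log(2) in hi/lo pairs; this truncated version is only illustrative):

```c
#include <math.h>

static double neg_log_model(double x) {
  int n;
  double xs = frexp(x, &n);                 // x = xs * 2^n, xs in [0.5, 1)
  double r = 2.0 * (1.0 - xs) / (1.0 + xs); // -log(xs) = 2*atanh(r/2)
  double r2 = r * r;
  double series =
      r * (1.0 + r2 * (1.0 / 12 + r2 * (1.0 / 80 + r2 * (1.0 / 448))));
  return -(double)n * M_LN2 + series;       // = -log(x), approximately
}
```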
// This macro computes w_hi + w_lo = sqrt(y_hi + y_lo) in floating point
// and 1/(w_hi + w_lo) as a Q63 fixed-point T.
// y_hi, y_lo is normalized on input; that is, y_hi has
// full working precision of the sum y_hi + y_lo,
// and 2 log(2) < y_hi < 1100 log(2)
#define SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi, w_lo, T, t_sc, t_sc_inv, vlen) \
  do { \
    (w_hi) = __riscv_vfsqrt((y_hi), (vlen)); \
    (w_lo) = __riscv_vfnmsub((w_hi), (w_hi), (y_hi), (vlen)); \
    (w_lo) = __riscv_vfadd((w_lo), (y_lo), (vlen)); \
    VFLOAT recip; \
    recip = __riscv_vfadd((y_hi), (y_hi), (vlen)); \
    recip = __riscv_vfrec7(recip, (vlen)); \
    recip = __riscv_vfmul(recip, (w_hi), (vlen)); \
    (w_lo) = __riscv_vfmul((w_lo), recip, (vlen)); \
    /* w_hi + w_lo is sqrt(y_hi + y_lo) to extra precision */ \
    /* now compute T = t_sc/(w_hi + w_lo) as fixed point */ \
    VFLOAT t_lo = VFMV_VF((t_sc), (vlen)); \
    VFLOAT t_hi = __riscv_vfdiv(t_lo, (w_hi), (vlen)); \
    (T) = __riscv_vfcvt_x(t_hi, (vlen)); \
    t_lo = __riscv_vfnmsac(t_lo, (w_hi), t_hi, (vlen)); \
    t_lo = __riscv_vfnmsac(t_lo, (w_lo), t_hi, (vlen)); \
    t_lo = __riscv_vfmul(t_lo, t_hi, (vlen)); \
    t_lo = __riscv_vfmul(t_lo, (t_sc_inv), (vlen)); \
    (T) = __riscv_vadd((T), __riscv_vfcvt_x(t_lo, (vlen)), (vlen)); \
  } while (0)

#define ERFCINV_PQ_TINY(T, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen) \
  do { \
    /* T is in scale of 65 */ \
    VINT P, Q; \
    P = PSTEP_I_SRA(P_tiny_7, T, 4, \
                    PSTEP_I_SRA(P_tiny_8, P_tiny_9, 4, T, (vlen)), (vlen)); \
    /* P in Q_56 */ \
    P = PSTEP_I_SRA(P_tiny_5, T, 1, \
                    PSTEP_I_SRA(P_tiny_6, P, 2, T, (vlen)), (vlen)); \
    /* P in Q_57 */ \
    P = PSTEP_I_SRA(P_tiny_3, T, 1, PSTEP_I(P_tiny_4, P, T, (vlen)), (vlen)); \
    /* P in Q_60 */ \
    P = PSTEP_I_SLL(P_tiny_1, T, 1, \
                    PSTEP_I_SRA(P_tiny_2, P, 1, T, (vlen)), (vlen)); \
    /* P in Q_64 */ \
    P = PSTEP_I(P_tiny_0, T, P, (vlen)); \
    /* P in Q_66 */ \
    \
    Q = PSTEP_I_SRA(Q_tiny_7, T, 2, \
                    PSTEP_I_SRA(Q_tiny_8, Q_tiny_9, 3, T, (vlen)), (vlen)); \
    /* Q in Q_56 */ \
    Q = PSTEP_I_SRA(Q_tiny_5, T, 1, \
                    PSTEP_I_SRA(Q_tiny_6, Q, 2, T, (vlen)), (vlen)); \
    /* Q in Q_57 */ \
    Q = PSTEP_I_SRA(Q_tiny_3, T, 2, PSTEP_I(Q_tiny_4, Q, T, (vlen)), (vlen)); \
    /* Q in Q_59 */ \
    Q = PSTEP_I_SLL(Q_tiny_1, T, 1, PSTEP_I(Q_tiny_2, Q, T, (vlen)), (vlen)); \
    /* Q in Q_64 */ \
    Q = PSTEP_I(Q_tiny_0, T, Q, (vlen)); \
    /* Q in Q_66 */ \
    \
    p_hi_tiny = __riscv_vfcvt_f(P, (vlen)); \
    p_lo_tiny = __riscv_vfcvt_f( \
        __riscv_vsub(P, __riscv_vfcvt_x(p_hi_tiny, (vlen)), (vlen)), (vlen)); \
    p_lo_tiny = __riscv_vfadd(p_lo_tiny, DELTA_P0_tiny, (vlen)); \
    q_hi_tiny = __riscv_vfcvt_f(Q, (vlen)); \
    q_lo_tiny = __riscv_vfcvt_f( \
        __riscv_vsub(Q, __riscv_vfcvt_x(q_hi_tiny, (vlen)), (vlen)), (vlen)); \
    q_lo_tiny = __riscv_vfadd(q_lo_tiny, DELTA_Q0_tiny, (vlen)); \
  } while (0)
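`SQRTX_4_TRANSFORM` refines `vfsqrt` with one first-order correction, sqrt(y) ~ w_hi + (y - w_hi^2)/(2*w_hi), and then builds the fixed-point reciprocal T ~ t_sc/w that the polynomials run on. A scalar sketch, using an exact divide where the vector code uses a `vfrec7` estimate plus FMA clean-up:

```c
#include <math.h>
#include <stdint.h>

// fma() stands in for vfnmsub/vfnmsac; llrint for vfcvt_x.
static void sqrtx_model(double y_hi, double y_lo, double t_sc,
                        double *w_hi, double *w_lo, int64_t *T) {
  *w_hi = sqrt(y_hi);
  // residual y - w_hi^2 (exact via fma), folded with y_lo:
  double res = fma(-*w_hi, *w_hi, y_hi) + y_lo;
  *w_lo = res / (2.0 * *w_hi); // first-order sqrt correction
  // T = round(t_sc / (w_hi + w_lo)); the vector code adds the correction
  // as a second fixed-point term instead of a fresh divide.
  *T = (int64_t)llrint(t_sc / (*w_hi + *w_lo));
}
```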
diff --git a/src/rvvlm_erfcinvD.c b/src/rvvlm_erfcinvD.c
new file mode 100644
index 0000000..356c2ff
--- /dev/null
+++ b/src/rvvlm_erfcinvD.c
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE UNIT_STRIDE

#include RVVLM_ERFCINVD_VSET_CONFIG

#define COMPILE_FOR_ERFCINV
#include "rvvlm_inverrorfuncsD.h"

#include "rvvlm_erfcinvD.inc.h"
diff --git a/src/rvvlm_erfcinvDI.c b/src/rvvlm_erfcinvDI.c
new file mode 100644
index 0000000..ab6defb
--- /dev/null
+++ b/src/rvvlm_erfcinvDI.c
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE GENERAL_STRIDE

#include RVVLM_ERFCINVDI_VSET_CONFIG

#define COMPILE_FOR_ERFCINV
#include "rvvlm_inverrorfuncsD.h"

#include "rvvlm_erfcinvD.inc.h"
diff --git a/src/rvvlm_erfinvD.c b/src/rvvlm_erfinvD.c
new file mode 100644
index 0000000..864d6c2
--- /dev/null
+++ b/src/rvvlm_erfinvD.c
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE UNIT_STRIDE

#include RVVLM_ERFINVD_VSET_CONFIG

#define COMPILE_FOR_ERFINV
#include "rvvlm_inverrorfuncsD.h"

#include "rvvlm_erfinvD.inc.h"
diff --git a/src/rvvlm_erfinvDI.c b/src/rvvlm_erfinvDI.c
new file mode 100644
index 0000000..005d032
--- /dev/null
+++ b/src/rvvlm_erfinvDI.c
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE GENERAL_STRIDE

#include RVVLM_ERFINVDI_VSET_CONFIG

#define COMPILE_FOR_ERFINV
#include "rvvlm_inverrorfuncsD.h"

#include "rvvlm_erfinvD.inc.h"
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f4ce376..89ccd76 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -30,10 +30,12 @@ set(TEST_SOURCES
 # src/test_cbrtI.cpp
 # src/test_cdfnorm.cpp
 # src/test_cdfnormI.cpp
-# src/test_erf.cpp
-# src/test_erfI.cpp
-# src/test_erfc.cpp
-# src/test_erfcI.cpp
+  src/test_erf.cpp
+  src/test_erfI.cpp
+  src/test_erfc.cpp
+  src/test_erfcI.cpp
+  src/test_erfinv.cpp
+  src/test_erfcinv.cpp
   src/test_exp.cpp
   src/test_expI.cpp
   src/test_exp2.cpp
diff --git a/test/include/test_infra.h b/test/include/test_infra.h
index ba09dce..ccb01ab 100644
--- a/test/include/test_infra.h
+++ b/test/include/test_infra.h
@@ -42,6 +42,11 @@ void report_err2_fp64(void (*test_func)(size_t, const double *, size_t, double *,
                       long double (*ref_func)(long double, long double),
                       double, double, int, int, double, double, int, int, int,
                       bool);

+void report_err_byinv_fp64(void (*test_func)(size_t, const double *, double *),
+                           long double (*ref_inv_func)(long double),
+                           long double (*ref_inv_func_prime)(long double),
+                           double, double, int, double = 1.0);
+
 void report_err_pow_fp64(void (*test_func)(size_t, const double *,
                                            const double *, double *),
                          long double (*ref_func)(long double, long double),
@@ -78,3 +83,5 @@ long double cdfnorml(long double);
 long double exp_neg_rsq(long double);
 long double x_transform(long double);
 long double recip_scale(long double);
+long double erfl_prime(long double);
+long double erfcl_prime(long double);
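`report_err_byinv_fp64`, declared above, scores an inverse function without needing a high-precision reference for the inverse itself: it maps the computed output back through the forward long-double reference and scales by its derivative. To first order, with y the computed result and y* the exact one (erfc shown; erf is analogous):

```latex
\operatorname{erfc}(y^{*}) = x
\quad\Longrightarrow\quad
y^{*} - y \;\approx\; \frac{x - \operatorname{erfc}(y)}{\operatorname{erfc}'(y)},
\qquad
\operatorname{erfc}'(y) = -\frac{2}{\sqrt{\pi}}\, e^{-y^{2}} .
```

The absolute value of that estimate is what the implementation below converts to relative and ULP errors; the optional last parameter (default 1.0) is the ULP threshold enforced by `EXPECT_LT`.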
diff --git a/test/src/test_erf.cpp b/test/src/test_erf.cpp
index d8de902..6f82462 100644
--- a/test/src/test_erf.cpp
+++ b/test/src/test_erf.cpp
@@ -14,7 +14,7 @@ TEST(erf, test) {
   COMMENT("erf: current chosen algorithm; reduced argument in FP64 only")

-  // show_special_fp64(rvvlm_erf, "Special Value handling of this function");
+  show_special_fp64(rvvlm_erf, "Special Value handling of this function");

   x_start = 0x1.0p-40;
   x_end = 0x1.0p-20;
diff --git a/test/src/test_erfc.cpp b/test/src/test_erfc.cpp
index 265a15a..e0f149f 100644
--- a/test/src/test_erfc.cpp
+++ b/test/src/test_erfc.cpp
@@ -8,45 +8,48 @@
 #include "rvvlm.h"
 #include "test_infra.h"

-TEST(erfc, test) {
+TEST(erfc, special) {
   unsigned long nb_tests;
   double x_start, x_end;

   COMMENT("erfc: current chosen algorithm; reduced argument in FP64 only")

   show_special_fp64(rvvlm_erfc, "Special Value handling of this function");
+}
+
+TEST(erfc, medium_args) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  COMMENT("erfc: current chosen algorithm; reduced argument in FP64 only")

   x_start = -0x1.0p0;
   x_end = 0x1.0p0;
   nb_tests = 400000;
   report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
+}

-  x_start = 0x1.0p-2;
-  x_end = 0x1.0p0;
-  nb_tests = 400000;
-  report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
-  report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
+TEST(erfc, around_one) {
+  unsigned long nb_tests;
+  double x_start, x_end;

-  x_start = 0x1.0p0;
-  x_end = 0x1.0p3;
-  nb_tests = 400000;
-  report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
-  report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
+  COMMENT("erfc: current chosen algorithm; reduced argument in FP64 only")

-  x_start = 0x1.0p3;
-  x_end = 30.0;
+  x_start = 0x1.0p-1;
+  x_end = 0x1.0p1;
   nb_tests = 400000;
   report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
+}

-  x_start = 0x1.0p-80;
-  x_end = 0x1.0p-50;
-  nb_tests = 400000;
-  report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
-  report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
+TEST(erfc, large) {
+  unsigned long nb_tests;
+  double x_start, x_end;

-  x_start = 0x1.0p-50;
-  x_end = 0x1.0p-10;
+  COMMENT("erfc: current chosen algorithm; reduced argument in FP64 only")
+
+  x_start = 0x1.0p1;
+  x_end = 0x1.0p3;
   nb_tests = 400000;
   report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
diff --git a/test/src/test_erfcinv.cpp b/test/src/test_erfcinv.cpp
new file mode 100644
index 0000000..dc7f416
--- /dev/null
+++ b/test/src/test_erfcinv.cpp
@@ -0,0 +1,82 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>
#include <math.h>

#include "rvvlm.h"
#include "test_infra.h"

TEST(erfcinv, special) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

  show_special_fp64(rvvlm_erfcinv, "Special Value handling of this function");
}

TEST(erfcinv, tiny_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-1074;
  x_end = 0x1.0p-1000;
  nb_tests = 40000;
  double thres = 3.5;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests, thres);

  x_start = 0x1.0p-1000;
  x_end = 0x1.0p-500;
  nb_tests = 40000;
  thres = 2.5;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests, thres);

  x_start = 0x1.0p-500;
  x_end = 0x1.0p-52;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests, thres);
}

TEST(erfcinv, small_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-50;
  x_end = 0x1.0p-20;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-20;
  x_end = 0x1.0p-4;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);
}

TEST(erfcinv, medium_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-4;
  x_end = 0x1.0p-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-1;
  x_end = 0x1.8p0;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);

  x_start = 2.0 - 0x1.0p-52;
  x_end = 0x1.8p0;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);
}
diff --git a/test/src/test_erfinv.cpp b/test/src/test_erfinv.cpp
new file mode 100644
index 0000000..1744e02
--- /dev/null
+++ b/test/src/test_erfinv.cpp
@@ -0,0 +1,96 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>
#include <math.h>

#include "rvvlm.h"
#include "test_infra.h"

TEST(erfinv, special) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")

  show_special_fp64(rvvlm_erfinv, "Special Value handling of this function");
}

TEST(erfinv, small_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-40;
  x_end = 0x1.0p-30;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-30;
  x_end = 0x1.0p-10;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-10;
  x_end = 0x1.0p-4;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);
}

TEST(erfinv, middle_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-4;
  x_end = 0x1.0p-2;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-2;
  x_end = 0x1.0p-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-1;
  x_end = 0x1.78p-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.6p-1;
  x_end = 0x1.7cp-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.71p-1;
  x_end = 0x1.8p-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = -0.8;
  x_end = 0.8;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0.25;
  x_end = 0.9;
  nb_tests = 2;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);
}

TEST(erfinv, close_to_1) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 1.0 - 0x1.0p-53;
  x_end = 0x1.ffp-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);
}
diff --git a/test/src/test_infra.cpp b/test/src/test_infra.cpp
index 715a075..686501e 100644
--- a/test/src/test_infra.cpp
+++ b/test/src/test_infra.cpp
@@ -457,6 +457,91 @@ void report_err_fp64(void (*test_func)(size_t, const double *, size_t, double *,
   free(z);
 }

+void report_err_byinv_fp64(void (*test_func)(size_t, const double *, double *),
+                           long double (*ref_inv_func)(long double),
+                           long double (*ref_inv_func_prime)(long double),
+                           double start, double end, int nb_pts,
+                           double threshold) {
+
+  long double f_inv_ref;
+  double *x, *y, delta;
+  long double abs_err, rel_err, ulp_err;
+  long double max_abs_err, max_rel_err, max_ulp_err;
+
+  x = (double *)malloc(nb_pts * sizeof(double));
+  y = (double *)malloc(nb_pts * sizeof(double));
+
+  if (nb_pts <= 1) {
+    delta = 0.0;
+    nb_pts = 1;
+  } else {
+    delta = (end - start) / (double)(nb_pts - 1);
+  }
+
+  max_abs_err = 0.0L;
+  max_rel_err = 0.0L;
+  max_ulp_err = 0.0L;
+
+  // fill up the set of test points
+  for (int i = 0; i < nb_pts; ++i) {
+    x[i] = start + (double)i * delta;
+  }
+
+  // call function under test
+  test_func((size_t)nb_pts, x, y);
+
+  // now for each point we compute error and log the max
+  for (int j = 0; j < nb_pts; j++) {
+    f_inv_ref = ref_inv_func((long double)y[j]);
+    abs_err = (long double)x[j] - f_inv_ref;
+    abs_err = abs_err / ref_inv_func_prime((long double)y[j]);
+    abs_err = ABS(abs_err);
+
+    if (ABS(y[j]) > 0.0) {
+      rel_err = abs_err / ABS(y[j]);
+    } else {
+      rel_err = abs_err / 0x1.0p-1074;
+    }
+    ulp_err = abs_err / ulp_64(y[j]);
+
+    max_abs_err = MAX(max_abs_err, abs_err);
+    max_rel_err = MAX(max_rel_err, rel_err);
+    max_ulp_err = MAX(max_ulp_err, ulp_err);
+
+    if (VERBOSE) {
+      union sui64_fp64 xxx, yyy;
+      xxx.f = x[j];
+      yyy.f = y[j];
+      printf("--input %24.17le, 0x%016lx, output %24.17le, 0x%016lx \n", xxx.f,
+             xxx.ui, yyy.f, yyy.ui);
+      printf(" inverse of computed value %28.25Le\n\n", f_inv_ref);
+      printf(" abs err %8.3Le, rel err %8.3Le, ulp err %8.3Le\n\n", abs_err,
+             rel_err, ulp_err);
+      printf(" max observed abs err %8.3Le\n", max_abs_err);
+    }
+  }
+  printf("----------------------------\n");
+  if ((ABS(start) > 100.) || (ABS(end) < 1.e-2)) {
+    printf("Tested %d points in [%8.3le, %8.3le]\n", nb_pts, start, end);
+  } else {
+    printf("Tested %d points in [%3.3lf, %3.3lf]\n", nb_pts, start, end);
+  }
+  printf("Maximum observed absolute error is %8.3Le\n", max_abs_err);
+  printf("Maximum observed relative error is %8.3Le\n", max_rel_err);
+  if (max_rel_err > 0.0) {
+    printf("   which is 2^(%3.3Lf)\n", log2l(max_rel_err));
+  }
+  printf("Maximum observed ULP error is %3.3Lf\n", max_ulp_err);
+
+  EXPECT_LT(max_ulp_err, threshold);
+
+  free(x);
+  free(y);
+
+  return;
+}
+
 void report_err_pow_fp64(void (*test_func)(size_t, const double *,
                                            const double *, double *),
                          long double (*ref_func)(long double, long double),
@@ -1108,3 +1193,17 @@ long double recip_scale(long double x) {
   result = result / (result + x + x);
   return result;
 }
+
+long double erfl_prime(long double x) {
+  long double two_ov_rt_pi = 1.12837916709551257389615890312154517L;
+  long double y;
+  y = two_ov_rt_pi * expl(-x * x);
+  return y;
+}
+
+long double erfcl_prime(long double x) {
+  long double two_ov_rt_pi = 1.12837916709551257389615890312154517L;
+  long double y;
+  y = -two_ov_rt_pi * expl(-x * x);
+  return y;
+}
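A standalone cross-check (not part of the test suite) that the closed-form derivatives above agree with a symmetric difference quotient of the long-double reference:

```c
#include <math.h>
#include <stdio.h>

// Compare d/dx erfl(x) = (2/sqrt(pi)) exp(-x^2) against a numerical estimate.
static long double erfl_prime_ref(long double x, long double h) {
  return (erfl(x + h) - erfl(x - h)) / (2.0L * h);
}

int main(void) {
  long double x = 0.75L, h = 0x1.0p-20L;
  long double exact = 1.12837916709551257389615890312154517L * expl(-x * x);
  printf("difference quotient: %.18Lg\nclosed form:         %.18Lg\n",
         erfl_prime_ref(x, h), exact);
  return 0;
}
```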