diff --git a/include/rvvlm.h b/include/rvvlm.h index a5ac3a0..eccf2b8 100644 --- a/include/rvvlm.h +++ b/include/rvvlm.h @@ -71,6 +71,9 @@ union sui64_fp64 { } \ } while (0) +#define VSRL_I_AS_U(x, nbits, vlen) \ + U_AS_I(__riscv_vsrl(I_AS_U((x)), (nbits), (vlen))) + #define PSTEP(coeff_j, x, poly, vlen) \ __riscv_vfmadd((poly), (x), VFMV_VF((coeff_j), (vlen)), (vlen)) @@ -93,6 +96,13 @@ union sui64_fp64 { __riscv_vsra(__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \ (COEFF_j), (vlen)) +#define PSTEP_I_HI_SRA(COEFF_j, X, K, POLY, vlen) \ + __riscv_vadd(__riscv_vsra(__riscv_vmulh((POLY), (X), (vlen)), (K), (vlen)), \ + (COEFF_j), (vlen)) + +#define PSTEP_I_HI(COEFF_j, X, POLY, vlen) \ + __riscv_vadd(__riscv_vmulh((POLY), (X), (vlen)), (COEFF_j), (vlen)) + #define PSTEPN_I(COEFF_j, X, POLY, vlen) \ __riscv_vrsub(__riscv_vsmul((POLY), (X), 1, (vlen)), (COEFF_j), (vlen)) diff --git a/include/rvvlm_erfcinvD.inc.h b/include/rvvlm_erfcinvD.inc.h index 7e8805d..147328b 100644 --- a/include/rvvlm_erfcinvD.inc.h +++ b/include/rvvlm_erfcinvD.inc.h @@ -107,8 +107,8 @@ void F_VER1(API) { 0x1.0p-63, vlen); if (__riscv_vcpop(x_is_tiny, vlen) > 0) { VFLOAT w_hi_dummy, w_lo_dummy; - SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p65, - 0x1.0p-65, vlen); + SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p64, + 0x1.0p-64, vlen); } } w_hi = VFMV_VF(fp_posOne, vlen); @@ -195,7 +195,8 @@ void F_VER1(API) { if (__riscv_vcpop(x_is_tiny, vlen) > 0) { VFLOAT p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny; - ERFCINV_PQ_TINY(T_tiny, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen); + ERFCINV_PQ_HILO_TINY(T_tiny, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, + vlen); p_hi = __riscv_vmerge(p_hi, p_hi_tiny, x_is_tiny, vlen); p_lo = __riscv_vmerge(p_lo, p_lo_tiny, x_is_tiny, vlen); q_hi = __riscv_vmerge(q_hi, q_hi_tiny, x_is_tiny, vlen); diff --git a/include/rvvlm_inverrorfuncsD.h b/include/rvvlm_inverrorfuncsD.h index 8e1b619..df670a3 100644 --- a/include/rvvlm_inverrorfuncsD.h +++ b/include/rvvlm_inverrorfuncsD.h @@ -35,6 +35,52 @@ #define Q_tiny_9 0x5e4a26a7c1415755 // sacle 57 #define DELTA_Q0_tiny 0x1.8a7adad44d65ap-4 // scale 66 +#define Q50_84 +// Using [P,Q]_tiny_[HI,LO]_k, HI in Q50, LO in Q84 +#if defined(Q50_84) +#define P_tiny_HI_0 -0x8593442eL +#define P_tiny_LO_0 -0x4e7245b3L +#define P_tiny_HI_1 -0x7f3dc156b1L +#define P_tiny_LO_1 -0x1f0300096L +#define P_tiny_HI_2 -0x20dbbc67b1b8L +#define P_tiny_LO_2 -0xbc59b742L +#define P_tiny_HI_3 -0x30c75b8264d44L +#define P_tiny_LO_3 -0x18a421ab9L +#define P_tiny_HI_4 -0x1ab06f39e5addeL +#define P_tiny_LO_4 -0x180f2a477L +#define P_tiny_HI_5 -0x3d698fc2964a9cL +#define P_tiny_LO_5 0xc3d4ab0bL +#define P_tiny_HI_6 0x7f7d6a748d0c2fL +#define P_tiny_LO_6 0x1729754e9L +#define P_tiny_HI_7 0x18b100818c77848L +#define P_tiny_LO_7 0x1aca73439L +#define P_tiny_HI_8 0x4be1ed5d031774L +#define P_tiny_LO_8 -0x3b6c5afbL +#define P_tiny_HI_9 0x156a7e2e54e260L +#define P_tiny_LO_9 0x1a0c336beL + +#define Q_tiny_HI_0 -0x85933cdaL +#define Q_tiny_LO_0 -0xb5b39d61L +#define Q_tiny_HI_1 -0x7f3de4b69fL +#define Q_tiny_LO_1 -0x151d1cd35L +#define Q_tiny_HI_2 -0x20dd8dc1da27L +#define Q_tiny_LO_2 -0x1706945d7L +#define Q_tiny_HI_3 -0x30dc92d1cd231L +#define Q_tiny_LO_3 0xabde03f9L +#define Q_tiny_HI_4 -0x1af5fcee397d58L +#define Q_tiny_LO_4 -0xc3530d28L +#define Q_tiny_HI_5 -0x42639eeec1d051L +#define Q_tiny_LO_5 0x662b41ecL +#define Q_tiny_HI_6 0x6182b99f6ca998L +#define Q_tiny_LO_6 0x938a5e35L +#define Q_tiny_HI_7 0x17a6848dc07624aL +#define Q_tiny_LO_7 0x8a0484b7L +#define Q_tiny_HI_8 0x105ecd6aac52b12L +#define Q_tiny_LO_8 0x1d1e38258L +#define Q_tiny_HI_9 0xbc944d4f8282afL +#define Q_tiny_LO_9 -0x155b50b48L +#endif + // erfinv(+-1) = +-Inf with divide by zero // erfinv(x) |x| > 1, real is NaN with invalid // erfinv(NaN) is NaN, invalid if input is signalling NaN @@ -211,3 +257,73 @@ __riscv_vsub(Q, __riscv_vfcvt_x(q_hi_tiny, (vlen)), (vlen)), (vlen)); \ q_lo_tiny = __riscv_vfadd(q_lo, DELTA_Q0_tiny, (vlen)); \ } while (0) + +#define UPDATE_P_LO(COEFF, T, P_HI, P_LO, P_tmp, K, vlen) \ + do { \ + (P_LO) = PSTEP_I_HI((COEFF), (T), (P_LO), (vlen)); \ + (P_tmp) = __riscv_vmul((T), (P_HI), (vlen)); \ + (P_tmp) = VSRL_I_AS_U((P_tmp), (K), (vlen)); \ + (P_LO) = __riscv_vadd((P_LO), (P_tmp), (vlen)); \ + } while (0) + +#define ERFCINV_PQ_HILO_TINY(T, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, \ + vlen) \ + do { \ + /* T is in scale of 64 */ \ + VINT P_HI, P_LO, Q_HI, Q_LO, P_tmp, Q_tmp; \ + \ + P_HI = VMVI_VX(P_tiny_HI_9, (vlen)); \ + P_LO = VMVI_VX(P_tiny_LO_9, (vlen)); \ + \ + UPDATE_P_LO(P_tiny_LO_8, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_8, (T), P_HI, (vlen)); \ + UPDATE_P_LO(P_tiny_LO_7, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_7, (T), P_HI, (vlen)); \ + UPDATE_P_LO(P_tiny_LO_6, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_6, (T), P_HI, (vlen)); \ + UPDATE_P_LO(P_tiny_LO_5, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_5, (T), P_HI, (vlen)); \ + UPDATE_P_LO(P_tiny_LO_4, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_4, (T), P_HI, (vlen)); \ + UPDATE_P_LO(P_tiny_LO_3, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_3, (T), P_HI, (vlen)); \ + UPDATE_P_LO(P_tiny_LO_2, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_2, (T), P_HI, (vlen)); \ + UPDATE_P_LO(P_tiny_LO_1, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_1, (T), P_HI, (vlen)); \ + UPDATE_P_LO(P_tiny_LO_0, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \ + P_HI = PSTEP_I_HI(P_tiny_HI_0, (T), P_HI, (vlen)); \ + \ + Q_HI = VMVI_VX(Q_tiny_HI_9, (vlen)); \ + Q_LO = VMVI_VX(Q_tiny_LO_9, (vlen)); \ + \ + UPDATE_P_LO(Q_tiny_LO_8, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_8, (T), Q_HI, (vlen)); \ + UPDATE_P_LO(Q_tiny_LO_7, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_7, (T), Q_HI, (vlen)); \ + UPDATE_P_LO(Q_tiny_LO_6, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_6, (T), Q_HI, (vlen)); \ + UPDATE_P_LO(Q_tiny_LO_5, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_5, (T), Q_HI, (vlen)); \ + UPDATE_P_LO(Q_tiny_LO_4, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_4, (T), Q_HI, (vlen)); \ + UPDATE_P_LO(Q_tiny_LO_3, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_3, (T), Q_HI, (vlen)); \ + UPDATE_P_LO(Q_tiny_LO_2, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_2, (T), Q_HI, (vlen)); \ + UPDATE_P_LO(Q_tiny_LO_1, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_1, (T), Q_HI, (vlen)); \ + UPDATE_P_LO(Q_tiny_LO_0, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \ + Q_HI = PSTEP_I_HI(Q_tiny_HI_0, (T), Q_HI, (vlen)); \ + \ + VFLOAT A = __riscv_vfcvt_f(P_HI, (vlen)); \ + p_lo_tiny = __riscv_vfcvt_f(P_LO, (vlen)); \ + p_hi_tiny = __riscv_vfmadd(p_lo_tiny, 0x1.0p-34, A, (vlen)); \ + p_lo_tiny = __riscv_vfmadd(p_lo_tiny, 0x1.0p-34, \ + __riscv_vfsub(A, p_hi_tiny, (vlen)), (vlen)); \ + VFLOAT B = __riscv_vfcvt_f(Q_HI, (vlen)); \ + q_lo_tiny = __riscv_vfcvt_f(Q_LO, (vlen)); \ + q_hi_tiny = __riscv_vfmadd(q_lo_tiny, 0x1.0p-34, B, (vlen)); \ + q_lo_tiny = __riscv_vfmadd(q_lo_tiny, 0x1.0p-34, \ + __riscv_vfsub(B, q_hi_tiny, (vlen)), (vlen)); \ + } while (0) diff --git a/test/src/test_erfcinv.cpp b/test/src/test_erfcinv.cpp index e937fcf..5dfae21 100644 --- a/test/src/test_erfcinv.cpp +++ b/test/src/test_erfcinv.cpp @@ -24,27 +24,22 @@ TEST(erfcinv, tiny_args) { COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only") x_start = 0x1.0p-1074; - ; x_end = 0x1.0p-1000; nb_tests = 40000; - double thres = 3.5; report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, - nb_tests, thres); + nb_tests); x_start = 0x1.0p-1000; - ; x_end = 0x1.0p-500; nb_tests = 40000; - thres = 2.5; report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, - nb_tests, thres); + nb_tests); - x_start = 0x1.0p-500; - ; - x_end = 0x1.0p-52; + x_start = 0x1.0p-55; + x_end = 0x1.0p-53; nb_tests = 40000; report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, - nb_tests, thres); + nb_tests); } TEST(erfcinv, small_args) {