Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improved accuracy of erfcinv for tiny arguments #27

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/rvvlm.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ union sui64_fp64 {
} \
} while (0)

#define VSRL_I_AS_U(x, nbits, vlen) \
U_AS_I(__riscv_vsrl(I_AS_U((x)), (nbits), (vlen)))

#define PSTEP(coeff_j, x, poly, vlen) \
__riscv_vfmadd((poly), (x), VFMV_VF((coeff_j), (vlen)), (vlen))

Expand All @@ -93,6 +96,13 @@ union sui64_fp64 {
__riscv_vsra(__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
(COEFF_j), (vlen))

#define PSTEP_I_HI_SRA(COEFF_j, X, K, POLY, vlen) \
__riscv_vadd(__riscv_vsra(__riscv_vmulh((POLY), (X), (vlen)), (K), (vlen)), \
(COEFF_j), (vlen))

#define PSTEP_I_HI(COEFF_j, X, POLY, vlen) \
__riscv_vadd(__riscv_vmulh((POLY), (X), (vlen)), (COEFF_j), (vlen))

#define PSTEPN_I(COEFF_j, X, POLY, vlen) \
__riscv_vrsub(__riscv_vsmul((POLY), (X), 1, (vlen)), (COEFF_j), (vlen))

Expand Down
7 changes: 4 additions & 3 deletions include/rvvlm_erfcinvD.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ void F_VER1(API) {
0x1.0p-63, vlen);
if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
VFLOAT w_hi_dummy, w_lo_dummy;
SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p65,
0x1.0p-65, vlen);
SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p64,
0x1.0p-64, vlen);
}
}
w_hi = VFMV_VF(fp_posOne, vlen);
Expand Down Expand Up @@ -195,7 +195,8 @@ void F_VER1(API) {

if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
VFLOAT p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny;
ERFCINV_PQ_TINY(T_tiny, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen);
ERFCINV_PQ_HILO_TINY(T_tiny, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny,
vlen);
p_hi = __riscv_vmerge(p_hi, p_hi_tiny, x_is_tiny, vlen);
p_lo = __riscv_vmerge(p_lo, p_lo_tiny, x_is_tiny, vlen);
q_hi = __riscv_vmerge(q_hi, q_hi_tiny, x_is_tiny, vlen);
Expand Down
116 changes: 116 additions & 0 deletions include/rvvlm_inverrorfuncsD.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,52 @@
#define Q_tiny_9 0x5e4a26a7c1415755 // sacle 57
#define DELTA_Q0_tiny 0x1.8a7adad44d65ap-4 // scale 66

#define Q50_84
// Using [P,Q]_tiny_[HI,LO]_k, HI in Q50, LO in Q84
#if defined(Q50_84)
#define P_tiny_HI_0 -0x8593442eL
#define P_tiny_LO_0 -0x4e7245b3L
#define P_tiny_HI_1 -0x7f3dc156b1L
#define P_tiny_LO_1 -0x1f0300096L
#define P_tiny_HI_2 -0x20dbbc67b1b8L
#define P_tiny_LO_2 -0xbc59b742L
#define P_tiny_HI_3 -0x30c75b8264d44L
#define P_tiny_LO_3 -0x18a421ab9L
#define P_tiny_HI_4 -0x1ab06f39e5addeL
#define P_tiny_LO_4 -0x180f2a477L
#define P_tiny_HI_5 -0x3d698fc2964a9cL
#define P_tiny_LO_5 0xc3d4ab0bL
#define P_tiny_HI_6 0x7f7d6a748d0c2fL
#define P_tiny_LO_6 0x1729754e9L
#define P_tiny_HI_7 0x18b100818c77848L
#define P_tiny_LO_7 0x1aca73439L
#define P_tiny_HI_8 0x4be1ed5d031774L
#define P_tiny_LO_8 -0x3b6c5afbL
#define P_tiny_HI_9 0x156a7e2e54e260L
#define P_tiny_LO_9 0x1a0c336beL

#define Q_tiny_HI_0 -0x85933cdaL
#define Q_tiny_LO_0 -0xb5b39d61L
#define Q_tiny_HI_1 -0x7f3de4b69fL
#define Q_tiny_LO_1 -0x151d1cd35L
#define Q_tiny_HI_2 -0x20dd8dc1da27L
#define Q_tiny_LO_2 -0x1706945d7L
#define Q_tiny_HI_3 -0x30dc92d1cd231L
#define Q_tiny_LO_3 0xabde03f9L
#define Q_tiny_HI_4 -0x1af5fcee397d58L
#define Q_tiny_LO_4 -0xc3530d28L
#define Q_tiny_HI_5 -0x42639eeec1d051L
#define Q_tiny_LO_5 0x662b41ecL
#define Q_tiny_HI_6 0x6182b99f6ca998L
#define Q_tiny_LO_6 0x938a5e35L
#define Q_tiny_HI_7 0x17a6848dc07624aL
#define Q_tiny_LO_7 0x8a0484b7L
#define Q_tiny_HI_8 0x105ecd6aac52b12L
#define Q_tiny_LO_8 0x1d1e38258L
#define Q_tiny_HI_9 0xbc944d4f8282afL
#define Q_tiny_LO_9 -0x155b50b48L
#endif

// erfinv(+-1) = +-Inf with divide by zero
// erfinv(x) |x| > 1, real is NaN with invalid
// erfinv(NaN) is NaN, invalid if input is signalling NaN
Expand Down Expand Up @@ -211,3 +257,73 @@
__riscv_vsub(Q, __riscv_vfcvt_x(q_hi_tiny, (vlen)), (vlen)), (vlen)); \
q_lo_tiny = __riscv_vfadd(q_lo, DELTA_Q0_tiny, (vlen)); \
} while (0)

#define UPDATE_P_LO(COEFF, T, P_HI, P_LO, P_tmp, K, vlen) \
do { \
(P_LO) = PSTEP_I_HI((COEFF), (T), (P_LO), (vlen)); \
(P_tmp) = __riscv_vmul((T), (P_HI), (vlen)); \
(P_tmp) = VSRL_I_AS_U((P_tmp), (K), (vlen)); \
(P_LO) = __riscv_vadd((P_LO), (P_tmp), (vlen)); \
} while (0)

#define ERFCINV_PQ_HILO_TINY(T, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, \
vlen) \
do { \
/* T is in scale of 64 */ \
VINT P_HI, P_LO, Q_HI, Q_LO, P_tmp, Q_tmp; \
\
P_HI = VMVI_VX(P_tiny_HI_9, (vlen)); \
P_LO = VMVI_VX(P_tiny_LO_9, (vlen)); \
\
UPDATE_P_LO(P_tiny_LO_8, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_8, (T), P_HI, (vlen)); \
UPDATE_P_LO(P_tiny_LO_7, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_7, (T), P_HI, (vlen)); \
UPDATE_P_LO(P_tiny_LO_6, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_6, (T), P_HI, (vlen)); \
UPDATE_P_LO(P_tiny_LO_5, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_5, (T), P_HI, (vlen)); \
UPDATE_P_LO(P_tiny_LO_4, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_4, (T), P_HI, (vlen)); \
UPDATE_P_LO(P_tiny_LO_3, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_3, (T), P_HI, (vlen)); \
UPDATE_P_LO(P_tiny_LO_2, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_2, (T), P_HI, (vlen)); \
UPDATE_P_LO(P_tiny_LO_1, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_1, (T), P_HI, (vlen)); \
UPDATE_P_LO(P_tiny_LO_0, (T), P_HI, P_LO, P_tmp, 30, (vlen)); \
P_HI = PSTEP_I_HI(P_tiny_HI_0, (T), P_HI, (vlen)); \
\
Q_HI = VMVI_VX(Q_tiny_HI_9, (vlen)); \
Q_LO = VMVI_VX(Q_tiny_LO_9, (vlen)); \
\
UPDATE_P_LO(Q_tiny_LO_8, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_8, (T), Q_HI, (vlen)); \
UPDATE_P_LO(Q_tiny_LO_7, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_7, (T), Q_HI, (vlen)); \
UPDATE_P_LO(Q_tiny_LO_6, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_6, (T), Q_HI, (vlen)); \
UPDATE_P_LO(Q_tiny_LO_5, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_5, (T), Q_HI, (vlen)); \
UPDATE_P_LO(Q_tiny_LO_4, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_4, (T), Q_HI, (vlen)); \
UPDATE_P_LO(Q_tiny_LO_3, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_3, (T), Q_HI, (vlen)); \
UPDATE_P_LO(Q_tiny_LO_2, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_2, (T), Q_HI, (vlen)); \
UPDATE_P_LO(Q_tiny_LO_1, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_1, (T), Q_HI, (vlen)); \
UPDATE_P_LO(Q_tiny_LO_0, (T), Q_HI, Q_LO, Q_tmp, 30, (vlen)); \
Q_HI = PSTEP_I_HI(Q_tiny_HI_0, (T), Q_HI, (vlen)); \
\
VFLOAT A = __riscv_vfcvt_f(P_HI, (vlen)); \
p_lo_tiny = __riscv_vfcvt_f(P_LO, (vlen)); \
p_hi_tiny = __riscv_vfmadd(p_lo_tiny, 0x1.0p-34, A, (vlen)); \
p_lo_tiny = __riscv_vfmadd(p_lo_tiny, 0x1.0p-34, \
__riscv_vfsub(A, p_hi_tiny, (vlen)), (vlen)); \
VFLOAT B = __riscv_vfcvt_f(Q_HI, (vlen)); \
q_lo_tiny = __riscv_vfcvt_f(Q_LO, (vlen)); \
q_hi_tiny = __riscv_vfmadd(q_lo_tiny, 0x1.0p-34, B, (vlen)); \
q_lo_tiny = __riscv_vfmadd(q_lo_tiny, 0x1.0p-34, \
__riscv_vfsub(B, q_hi_tiny, (vlen)), (vlen)); \
} while (0)
15 changes: 5 additions & 10 deletions test/src/test_erfcinv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,22 @@ TEST(erfcinv, tiny_args) {
COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

x_start = 0x1.0p-1074;
;
x_end = 0x1.0p-1000;
nb_tests = 40000;
double thres = 3.5;
report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
nb_tests, thres);
nb_tests);

x_start = 0x1.0p-1000;
;
x_end = 0x1.0p-500;
nb_tests = 40000;
thres = 2.5;
report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
nb_tests, thres);
nb_tests);

x_start = 0x1.0p-500;
;
x_end = 0x1.0p-52;
x_start = 0x1.0p-55;
x_end = 0x1.0p-53;
nb_tests = 40000;
report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
nb_tests, thres);
nb_tests);
}

TEST(erfcinv, small_args) {
Expand Down
Loading