Skip to content

Commit

Permalink
fix formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
PingTakPeterTang committed Mar 7, 2024
1 parent 677fe89 commit 65e6159
Show file tree
Hide file tree
Showing 6 changed files with 450 additions and 380 deletions.
13 changes: 6 additions & 7 deletions include/foo.h
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@

#define PSTEP_I_SLL(COEFF_j, X, K, POLY, vlen) \
__riscv_vsadd(__riscv_vsll( \
__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
(COEFF_j), (vlen))
__riscv_vsadd( \
__riscv_vsll(__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
(COEFF_j), (vlen))

#define PSTEP_I_SRA(COEFF_j, X, K, POLY, vlen) \
__riscv_vsadd(__riscv_vsra( \
__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
(COEFF_j), (vlen))

__riscv_vsadd( \
__riscv_vsra(__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
(COEFF_j), (vlen))
229 changes: 125 additions & 104 deletions include/rvvlm_erfcinvD.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,74 +8,73 @@
#define F_VER1 RVVLM_ERFCINVDI_STD
#endif

//Erfcinv is defined on (0, 2). Suffices to consider (0, 1]
//Two regions of approximation: left is [0, 0x1.2p-2) and right is [0x1.2p-1, 1)
//Both are done with rational functions.
//For right, t*P(t)/Q(t) t = 1-x; x in [0x1.2p-1, 1)
//For left, y*P(t)/Q(t), y = sqrt(-log(x)); and t = 1/y

// P_coefficients in asending order, all in Q79. p0_delta is in floating point
#define P_right_0 -0x48dbe9f5b3eabaa
#define P_right_1 -0xb35279f1a626ae5
#define P_right_2 0x33789911873d184a
#define P_right_3 -0x1bf9138fc77c0fbf
#define P_right_4 -0x2d4ec43bc48403d4
#define P_right_5 0x2d61deb53842cca1
#define P_right_6 0x23324eca6b3ff02
#define P_right_7 -0xd4ec1d31542c4fc
#define P_right_8 0x2ecf3c60308b0f2
#define P_right_9 0x7c917b3378071e
#define P_right_10 -0x1e09b597f226ca
// Erfcinv is defined on (0, 2). Suffices to consider (0, 1]
// Two regions of approximation: left is [0, 0x1.2p-2) and right is [0x1.2p-1,
// 1) Both are done with rational functions. For right, t*P(t)/Q(t) t = 1-x; x
// in [0x1.2p-1, 1) For left, y*P(t)/Q(t), y = sqrt(-log(x)); and t = 1/y

// P_coefficients in asending order, all in Q79. p0_delta is in floating point
#define P_right_0 -0x48dbe9f5b3eabaa
#define P_right_1 -0xb35279f1a626ae5
#define P_right_2 0x33789911873d184a
#define P_right_3 -0x1bf9138fc77c0fbf
#define P_right_4 -0x2d4ec43bc48403d4
#define P_right_5 0x2d61deb53842cca1
#define P_right_6 0x23324eca6b3ff02
#define P_right_7 -0xd4ec1d31542c4fc
#define P_right_8 0x2ecf3c60308b0f2
#define P_right_9 0x7c917b3378071e
#define P_right_10 -0x1e09b597f226ca
#define DELTA_P0_right -0x1.ec7dc41c17860p-2

// Q_coefficients in asending order, all in Q79. q0_delta is in floating point
#define Q_right_0 -0x52366e5b14c0970
#define Q_right_1 -0xca57e95abcc599b
#define Q_right_2 0x3b6c91ec67f5759c
#define Q_right_3 -0x1c40d5daa3be22bc
#define Q_right_4 -0x41f11eb5d837386c
#define Q_right_5 0x3c6ce478fcd75c9a
#define Q_right_6 0xbb1cd7270cfba1d
#define Q_right_7 -0x1988a4116498f1af
#define Q_right_8 0x44dc3042f103d20
#define Q_right_9 0x2390e683d02edf3
#define Q_right_10 -0x8ec66f2a7e410c
// Q_coefficients in asending order, all in Q79. q0_delta is in floating point
#define Q_right_0 -0x52366e5b14c0970
#define Q_right_1 -0xca57e95abcc599b
#define Q_right_2 0x3b6c91ec67f5759c
#define Q_right_3 -0x1c40d5daa3be22bc
#define Q_right_4 -0x41f11eb5d837386c
#define Q_right_5 0x3c6ce478fcd75c9a
#define Q_right_6 0xbb1cd7270cfba1d
#define Q_right_7 -0x1988a4116498f1af
#define Q_right_8 0x44dc3042f103d20
#define Q_right_9 0x2390e683d02edf3
#define Q_right_10 -0x8ec66f2a7e410c
#define DELTA_Q0_right -0x1.29a0161e99446p-3

// P_coefficients in asending order, all in Q67. p0_delta is in floating point
#define P_left_0 0x17a0bb69321df
#define P_left_1 0x402eb416ae6015
#define P_left_2 0x2973eb18028ce34
#define P_left_3 0x8034a7ece1d5370
#define P_left_4 0xa76c08a74dae273
#define P_left_5 0x11dd3876b83dd078
#define P_left_6 0xfdd7693c3b77653
#define P_left_7 0xb33d66152b3c223
#define P_left_8 0x5a564c28c6a41a9
#define P_left_9 0x1190449fe630213
#define P_left_10 -0x659c784274e1
// P_coefficients in asending order, all in Q67. p0_delta is in floating point
#define P_left_0 0x17a0bb69321df
#define P_left_1 0x402eb416ae6015
#define P_left_2 0x2973eb18028ce34
#define P_left_3 0x8034a7ece1d5370
#define P_left_4 0xa76c08a74dae273
#define P_left_5 0x11dd3876b83dd078
#define P_left_6 0xfdd7693c3b77653
#define P_left_7 0xb33d66152b3c223
#define P_left_8 0x5a564c28c6a41a9
#define P_left_9 0x1190449fe630213
#define P_left_10 -0x659c784274e1
#define DELTA_P0_left -0x1.d622f4cbe0eeep-2

// Q_coefficients in asending order, all in Q67. q0_delta is in floating point
#define Q_left_0 0x17a09aabf9cee
#define Q_left_1 0x4030b9059ffcad
#define Q_left_2 0x29b26b0d87f7855
#define Q_left_3 0x87572a13d3fa2dd
#define Q_left_4 0xd7a728b5620ac3c
#define Q_left_5 0x1754392b473fd439
#define Q_left_6 0x1791b9a091a816c2
#define Q_left_7 0x167f71db9e13b075
#define Q_left_8 0xcb9f5f3e5e618a4
#define Q_left_9 0x68271fae767c68e
#define Q_left_10 0x13745c4fa224b25
// Q_coefficients in asending order, all in Q67. q0_delta is in floating point
#define Q_left_0 0x17a09aabf9cee
#define Q_left_1 0x4030b9059ffcad
#define Q_left_2 0x29b26b0d87f7855
#define Q_left_3 0x87572a13d3fa2dd
#define Q_left_4 0xd7a728b5620ac3c
#define Q_left_5 0x1754392b473fd439
#define Q_left_6 0x1791b9a091a816c2
#define Q_left_7 0x167f71db9e13b075
#define Q_left_8 0xcb9f5f3e5e618a4
#define Q_left_9 0x68271fae767c68e
#define Q_left_10 0x13745c4fa224b25
#define DELTA_Q0_left 0x1.f7e7557a34ae6p-2

void F_VER1(API) {
size_t vlen;
VFLOAT vx, vx_sign, vy, vy_special;
VBOOL special_args;

VFLOAT dummy;
VFLOAT dummy;

SET_ROUNDTONEAREST;
// stripmining over input arguments
Expand All @@ -92,30 +91,30 @@ void F_VER1(API) {
VFLOAT two_minus_x = __riscv_vfadd(vx_sign, fp_posOne, vlen);
VBOOL x_gt_1 = __riscv_vmflt(vx_sign, fp_posZero, vlen);
vx = __riscv_vmerge(vx, two_minus_x, x_gt_1, vlen);
// vx is now in (0, 1]
// vx is now in (0, 1]
VBOOL x_in_left = __riscv_vmfle(vx, 0x1.2p-2, vlen);

VFLOAT w_hi, w_lo, w_hi_left, w_lo_left, y_hi, y_lo;
VINT T, T_left, T_tiny;
VBOOL x_is_tiny;

if (__riscv_vcpop(x_in_left, vlen) > 0) {
VFLOAT x_left = VFMV_VF(0x1.0p-3, vlen);
x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen);
x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-52, vlen);
INT n_adjust = 60;
x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen);
NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen);
SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left,
0x1.0p63, 0x1.0p-63, vlen);
if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
VFLOAT w_hi_dummy, w_lo_dummy;
SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny,
0x1.0p65, 0x1.0p-65, vlen);
dummy = __riscv_vfmul(__riscv_vfcvt_f(T_tiny, vlen), 0x1.0p-2, vlen);
dummy = __riscv_vfsub(dummy, __riscv_vfcvt_f(T_left, vlen), vlen);
}
VFLOAT x_left = VFMV_VF(0x1.0p-3, vlen);
x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen);
x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-52, vlen);
INT n_adjust = 60;
x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen);
NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen);

SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left, 0x1.0p63,
0x1.0p-63, vlen);
if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
VFLOAT w_hi_dummy, w_lo_dummy;
SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p65,
0x1.0p-65, vlen);
dummy = __riscv_vfmul(__riscv_vfcvt_f(T_tiny, vlen), 0x1.0p-2, vlen);
dummy = __riscv_vfsub(dummy, __riscv_vfcvt_f(T_left, vlen), vlen);
}
}
w_hi = VFMV_VF(fp_posOne, vlen);
w_hi = __riscv_vfsub(w_hi, vx, vlen);
Expand All @@ -125,54 +124,76 @@ void F_VER1(API) {
VFLOAT delta_t = __riscv_vfmul(w_lo, 0x1.0p63, vlen);
T = __riscv_vadd(T, __riscv_vfcvt_x(delta_t, vlen), vlen);
T = __riscv_vmerge(T, T_left, x_in_left, vlen);

w_hi = __riscv_vmerge(w_hi, w_hi_left, x_in_left, vlen);
w_lo = __riscv_vmerge(w_lo, w_lo_left, x_in_left, vlen);

//For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
// For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
VINT P, Q;

P = __riscv_vmerge(VMVI_VX(P_right_10, vlen), P_left_10, x_in_left, vlen);
P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T,
PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T,
PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T,
PSTEP_I_ab(x_in_left, P_left_9, P_right_9, T, P,
vlen), vlen), vlen), vlen);
P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T,
PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T,
PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T,
PSTEP_I_ab(x_in_left, P_left_9,
P_right_9, T, P, vlen),
vlen),
vlen),
vlen);

Q = __riscv_vmerge(VMVI_VX(Q_right_10, vlen), Q_left_10, x_in_left, vlen);
Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T,
PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T,
PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T,
PSTEP_I_ab(x_in_left, Q_left_9, Q_right_9, T, Q,
vlen), vlen), vlen), vlen);

P = PSTEP_I_ab(x_in_left, P_left_0, P_right_0, T,
PSTEP_I_ab(x_in_left, P_left_1, P_right_1, T,
PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T,
PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T,
PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T,
PSTEP_I_ab(x_in_left, P_left_5, P_right_5, T, P,
vlen), vlen), vlen), vlen), vlen), vlen);

Q = PSTEP_I_ab(x_in_left, Q_left_0, Q_right_0, T,
PSTEP_I_ab(x_in_left, Q_left_1, Q_right_1, T,
PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T,
PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T,
PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T,
PSTEP_I_ab(x_in_left, Q_left_5, Q_right_5, T, Q,
vlen), vlen), vlen), vlen), vlen), vlen);
Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T,
PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T,
PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T,
PSTEP_I_ab(x_in_left, Q_left_9,
Q_right_9, T, Q, vlen),
vlen),
vlen),
vlen);

P = PSTEP_I_ab(
x_in_left, P_left_0, P_right_0, T,
PSTEP_I_ab(
x_in_left, P_left_1, P_right_1, T,
PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T,
PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T,
PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T,
PSTEP_I_ab(x_in_left, P_left_5,
P_right_5, T, P, vlen),
vlen),
vlen),
vlen),
vlen),
vlen);

Q = PSTEP_I_ab(
x_in_left, Q_left_0, Q_right_0, T,
PSTEP_I_ab(
x_in_left, Q_left_1, Q_right_1, T,
PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T,
PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T,
PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T,
PSTEP_I_ab(x_in_left, Q_left_5,
Q_right_5, T, Q, vlen),
vlen),
vlen),
vlen),
vlen),
vlen);

VFLOAT p_hi, p_lo;
p_hi = __riscv_vfcvt_f(P, vlen);

p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen);
p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen),
vlen);
VFLOAT delta_p0 = VFMV_VF(DELTA_P0_right, vlen);
delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_left, x_in_left, vlen);
p_lo = __riscv_vfadd(p_lo, delta_p0, vlen);

VFLOAT q_hi, q_lo;
q_hi = __riscv_vfcvt_f(Q, vlen);
q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen);
q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen),
vlen);
VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_right, vlen);
delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_left, x_in_left, vlen);
q_lo = __riscv_vfadd(q_lo, delta_q0, vlen);
Expand All @@ -193,8 +214,8 @@ void F_VER1(API) {
y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen);

DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen);
vy = w_hi;

vy = w_hi;

vy = __riscv_vfsgnj(vy, vx_sign, vlen);
vy = __riscv_vmerge(vy, vy_special, special_args, vlen);
Expand Down
Loading

0 comments on commit 65e6159

Please sign in to comment.