Skip to content

Commit

Permalink
Merge pull request #16 from rivosinc/dev/PingTakPeterTang/erfc_n_ncdf
Browse files Browse the repository at this point in the history
add erfc and cdfnorm (cumulative distribution Gaussian 0,1)
  • Loading branch information
PingTakPeterTang authored Feb 3, 2024
2 parents 80c46ed + 6821766 commit 8e54234
Show file tree
Hide file tree
Showing 15 changed files with 696 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ set(PROJECT_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_atanhDI.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_cbrtD.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_cbrtDI.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_cdfnormD.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_cdfnormDI.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_erfcD.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_erfcDI.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_expD_tbl.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_expD.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_expDI.c
Expand Down
22 changes: 22 additions & 0 deletions include/rvvlm.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,20 @@ union sui64_fp64 {
#define RVVLM_CBRTDI_VSET_CONFIG "rvvlm_fp64m4.h"
#define RVVLM_CBRTDI_ITER rvvlm_cbrtI

// FP64 cdfnorm function configuration
#define RVVLM_CDFNORMD_VSET_CONFIG "rvvlm_fp64m1.h"
#define RVVLM_CDFNORMD_STD rvvlm_cdfnorm

#define RVVLM_CDFNORMDI_VSET_CONFIG "rvvlm_fp64m1.h"
#define RVVLM_CDFNORMDI_STD rvvlm_cdfnormI

// FP64 erfc function configuration
#define RVVLM_ERFCD_VSET_CONFIG "rvvlm_fp64m1.h"
#define RVVLM_ERFCD_STD rvvlm_erfc

#define RVVLM_ERFCDI_VSET_CONFIG "rvvlm_fp64m1.h"
#define RVVLM_ERFCDI_STD rvvlm_erfcI

// FP64 exp function configuration
#define RVVLM_EXPD_VSET_CONFIG "rvvlm_fp64m4.h"
#define RVVLM_EXPD_STD rvvlm_expD_std
Expand Down Expand Up @@ -464,6 +478,14 @@ void RVVLM_CBRTD_ITER(size_t x_len, const double *x, double *y);
void RVVLM_CBRTDI_ITER(size_t x_len, const double *x, size_t stride_x,
double *y, size_t stride_y);

void RVVLM_CDFNORMD_STD(size_t x_len, const double *x, double *y);
void RVVLM_CDFNORMDI_STD(size_t x_len, const double *x, size_t stride_x,
double *y, size_t stride_y);

void RVVLM_ERFCD_STD(size_t x_len, const double *x, double *y);
void RVVLM_ERFCDI_STD(size_t x_len, const double *x, size_t stride_x, double *y,
size_t stride_y);

void RVVLM_EXPD_STD(size_t x_len, const double *x, double *y);
void RVVLM_EXPDI_STD(size_t x_len, const double *x, size_t stride_x, double *y,
size_t stride_y);
Expand Down
250 changes: 250 additions & 0 deletions include/rvvlm_erfcD.inc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#if defined(COMPILE_FOR_ERFC)
#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_ERFCD_STD
#else
#define F_VER1 RVVLM_ERFCDI_STD
#endif
#elif defined(COMPILE_FOR_CDFNORM)
#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_CDFNORMD_STD
#else
#define F_VER1 RVVLM_CDFNORMDI_STD
#endif
#else
static_assert(false,
"Must compile for ERFC or CDFNORM when including " __FILE__);
#endif

#include <fenv.h>

#if defined(COMPILE_FOR_ERFC)
// polynomial coefficients Q62
#define P_0 0x4f33682d757709e8
#define P_1 -0x95970864bc25c71
#define P_2 0xa377a56796fd6f
#define P_3 0x5ea2d221c412d2d
#define P_4 -0x8f0caa24847e2a3
#define P_5 0x8ac6781d49af506
#define P_6 -0x67476ebb9bc1f58
#define P_7 0x3d0ed00f93b86cb
#define P_8 -0x1c36fb9d9556ac0
#define P_9 0x96c3f45eaad23b
#define P_10 -0x1a6d434ab9ada1
#define P_11 -0x4dd9356c9c3f8
#define P_12 0x4bb31b11d0a1a
#define P_13 -0xf2d325083d5b
#define P_14 -0x52720383749f
#define P_15 0x33f7f3f6cb7d
#define P_16 0x4ed13a394f
#define P_17 -0x770e9d9af50
#define P_18 0x108f3f3cf59
#define P_19 0x101b7f3c485
#define P_20 -0x3ab6fb75ad
#define P_21 -0x237088721c
#define P_22 0x6ed93407e
#define P_23 0x3dbfb2c72
#define NEG_A_SCALED -0x1.ep+64
#define B_FOR_TRANS 0x1.04p+2
#define MIN_CLIP 0x1.0p-60
#define MAX_CLIP 0x1.cp4
#else
// polynomial coefficients in Q63
#define P_0 0x6c25c9f6cfd132e7
#define P_1 -0x5abb8f458c7895f
#define P_2 -0x5ea2dcf3956792c
#define P_3 0xdd22963d83fa7d8
#define P_4 -0x107d667db8b90c84
#define P_5 0xea0acc44786d840
#define P_6 -0xa5e5b52ef29e23a
#define P_7 0x5ef73d5784d9dc6
#define P_8 -0x2acb1deb9208ae5
#define P_9 0xdf0d75186479cf
#define P_10 -0x25493132730985
#define P_11 -0x7daed4327549c
#define P_12 0x6ff2fb205b4f9
#define P_13 -0x15242feefcc0f
#define P_14 -0x7f14d7432d2b
#define P_15 0x4b2791427dab
#define P_16 0x17d0499cfa7
#define P_17 -0xae9fb960b85
#define P_18 0x15d4aa6975c
#define P_19 0x17cff734612
#define P_20 -0x505ad971f3
#define P_21 -0x34366c3ea9
#define P_22 0x97dfa0691
#define P_23 0x591d3b55a
#define NEG_A_SCALED -0x1.536p+65
#define B_FOR_TRANS 0x1.6fap+2
#define MIN_CLIP 0x1.0p-60
#define MAX_CLIP 0x1.4p5
#endif

// When COMPILE_FOR_ERFC
// The main computation is for erfc(|x|) and exploits the symmetry
// erfc(-|x|) = 2 - erfc(|x|)
// When COMPILE_FOR_CDFNORM
// The main compputation is for cdfnorm(-|x|) and exploits the symmetry
// cdfnorm(|x|) = 1 - cdfnorm(-|x|)
void F_VER1(API) {
size_t vlen;
VFLOAT vx, vx_orig, vy, vy_special;
VBOOL special_args;

SET_ROUNDTONEAREST;
// stripmining over input arguments
for (; _inarg_n > 0; _inarg_n -= vlen) {
vlen = VSET(_inarg_n);
vx = VFLOAD_INARG1(vlen);
vx_orig = vx;

// Handle Inf and NaN
EXCEPTION_HANDLING(vx, special_args, vy_special, vlen);

// suffices to focus on |x| clipped to [2^-60, 28]
vx = __riscv_vfsgnj(vx, fp_posOne, vlen);
vx = __riscv_vfmin(vx, MAX_CLIP, vlen);
vx = __riscv_vfmax(vx, MIN_CLIP, vlen);

VINT R;
// Compute (x-a)/(x+b) as Q63 fixed-point
X_TRANSFORM(vx, NEG_A_SCALED, B_FOR_TRANS, R, vlen);

VINT n, A;
#if defined(COMPILE_FOR_ERFC)
VFLOAT vy = vx;
#else
VFLOAT vy = __riscv_vfmul(vx, 0x1.0p-1, vlen);
#endif
// Compute exp(-x*x) or exp(-x*x/2) as 2^n a
// but return a as Q62 fixed-point A
EXP_negAB(vx, vy, n, A, vlen);

// Approximate exp(x*x)*(1+2x)*erfc(x)
// or exp(x*x/2)*(1+2x)*cdfnorm(-x)
// using a polynomial in r = (x-a)/(x+b)
// We use fixed-point computing
// -1 < r < 1, thus using Q63 fixed-point for r
// All coefficients are scaled the same and thus
// the final value is in this scaling.
// Scale is 2^62 for erfc and 2^63 for cdfnorm
VINT P_RIGHT = PSTEP_I(
P_16, R,
PSTEP_I(
P_17, R,
PSTEP_I(P_18, R,
PSTEP_I(P_19, R,
PSTEP_I(P_20, R,
PSTEP_I(P_21, R,
PSTEP_I(P_22, P_23, R, vlen), vlen),
vlen),
vlen),
vlen),
vlen),
vlen);

VINT R8 = __riscv_vsmul(R, R, 1, vlen);
R8 = __riscv_vsmul(R8, R8, 1, vlen);
R8 = __riscv_vsmul(R8, R8, 1, vlen);

VINT P_MID = PSTEP_I(
P_8, R,
PSTEP_I(
P_9, R,
PSTEP_I(P_10, R,
PSTEP_I(P_11, R,
PSTEP_I(P_12, R,
PSTEP_I(P_13, R,
PSTEP_I(P_14, P_15, R, vlen), vlen),
vlen),
vlen),
vlen),
vlen),
vlen);

P_RIGHT = __riscv_vsmul(R8, P_RIGHT, 1, vlen);
P_RIGHT = __riscv_vadd(P_RIGHT, P_MID, vlen);
P_RIGHT = __riscv_vsmul(R8, P_RIGHT, 1, vlen);

VINT P_LEFT = PSTEP_I(
P_0, R,
PSTEP_I(
P_1, R,
PSTEP_I(P_2, R,
PSTEP_I(P_3, R,
PSTEP_I(P_4, R,
PSTEP_I(P_5, R, PSTEP_I(P_6, P_7, R, vlen),
vlen),
vlen),
vlen),
vlen),
vlen),
vlen);

VINT P = __riscv_vadd(P_LEFT, P_RIGHT, vlen);

VINT m, B;
RECIP_SCALE(vx, B, m, vlen);

#if defined(COMPILE_FOR_ERFC)
// exp(-x^2) is 2^n * 2^(-62) * A
// 1/(1+2x) is 2^(-m) * B, m >= 62
// exp(x^2)(1+2x)erfc(x) is 2^(-62) * P
P = __riscv_vsmul(P, A, 1, vlen); // Q61
P = __riscv_vsmul(P, B, 1, vlen); // Q(m-2)
n = __riscv_vsub(n, m, vlen);
n = __riscv_vadd(n, 2, vlen); // n <= -60

VUINT ell = I_AS_U(__riscv_vrsub(n, -60, vlen));
ell = __riscv_vminu(ell, 63, vlen);
VINT PP = __riscv_vsra(P, ell, vlen);
VINT Q = VMVI_VX(1, vlen);
Q = __riscv_vsll(Q, 61, vlen);
Q = __riscv_vsub(Q, PP, vlen);
VFLOAT vz = __riscv_vfcvt_f(Q, vlen);
vz = __riscv_vfmul(vz, 0x1.0p-60, vlen);

vy = __riscv_vfcvt_f(P, vlen);
FAST_LDEXP(vy, n, vlen);
// vy is erfc(|x|) at this point

VBOOL x_is_neg = __riscv_vmflt(vx_orig, fp_posZero, vlen);
vy = __riscv_vmerge(vy, vz, x_is_neg, vlen);
#else
// exp(-x^2/2) is 2^n * 2^(-62) * A
// 1/(1+2x) is 2^(-m) * B, m >= 62
// exp(x^2/2)(1+2x)cdfnorm(-x) is 2^(-63) * P
P = __riscv_vsmul(P, A, 1, vlen); // Q62
P = __riscv_vsmul(P, B, 1, vlen); // Q(m-1)
n = __riscv_vsub(n, m, vlen);
n = __riscv_vadd(n, 1, vlen); // n <= -61

VUINT ell = I_AS_U(__riscv_vrsub(n, -61, vlen));
ell = __riscv_vminu(ell, 63, vlen);
VINT PP = __riscv_vsra(P, ell, vlen);
VINT Q = VMVI_VX(1, vlen);
Q = __riscv_vsll(Q, 61, vlen);
Q = __riscv_vsub(Q, PP, vlen);
VFLOAT vz = __riscv_vfcvt_f(Q, vlen);
vz = __riscv_vfmul(vz, 0x1.0p-61, vlen);

vy = __riscv_vfcvt_f(P, vlen);
FAST_LDEXP(vy, n, vlen);
// vy is cdfnorm(-|x|) at this point

VBOOL x_is_pos = __riscv_vmfgt(vx_orig, fp_posZero, vlen);
vy = __riscv_vmerge(vy, vz, x_is_pos, vlen);
#endif

vy = __riscv_vmerge(vy, vy_special, special_args, vlen);
// copy vy into y and increment addr pointers
VFSTORE_OUTARG1(vy, vlen);

INCREMENT_INARG1(vlen);
INCREMENT_OUTARG1(vlen);
}
RESTORE_FRM;
}
Loading

0 comments on commit 8e54234

Please sign in to comment.