Skip to content

Commit

Permalink
add tan and tanpi functions
Browse files Browse the repository at this point in the history
  • Loading branch information
PingTakPeterTang committed Dec 14, 2023
1 parent b7a4334 commit 984880a
Show file tree
Hide file tree
Showing 13 changed files with 494 additions and 12 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ set(PROJECT_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_sinDI.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_sinpiD.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_sinpiDI.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_tanD.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_tanDI.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_tanpiD.c
${CMAKE_CURRENT_SOURCE_DIR}/src/rvvlm_tanpiDI.c
)

add_library(vecm
Expand Down
32 changes: 32 additions & 0 deletions include/rvvlm.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,17 @@ union sui64_fp64 {
q = __riscv_vfmul(q, __riscv_vfrec7((denom), (vlen)), (vlen)); \
} while (0)

#define DIV_N2D2(numer, delta_n, denom, delta_d, Q, vlen) \
do { \
VFLOAT _q; \
(Q) = __riscv_vfdiv((numer), (denom), (vlen)); \
_q = __riscv_vfnmsub((Q), (denom), (numer), (vlen)); \
_q = __riscv_vfnmsac(_q, (Q), (delta_d), (vlen)); \
_q = __riscv_vfadd(_q, (delta_n), (vlen)); \
_q = __riscv_vfmul(_q, __riscv_vfrec7((denom), (vlen)), (vlen)); \
(Q) = __riscv_vfadd((Q), _q, (vlen)); \
} while (0)

#define IDENTIFY2(__vclass, __stencil, __signature, __identity_mask, __vlen) \
do { \
__signature = __riscv_vand(__vclass, __stencil, __vlen); \
Expand Down Expand Up @@ -238,6 +249,19 @@ union sui64_fp64 {
#define RVVLM_SINPIDI_VSET_CONFIG "rvvlm_fp64m2.h"
#define RVVLM_SINPIDI_MERGED rvvlm_sinpiI

// FP64 tan function configuration
#define RVVLM_TAND_VSET_CONFIG "rvvlm_fp64m2.h"
#define RVVLM_TAND_MERGED rvvlm_tan

#define RVVLM_TANDI_VSET_CONFIG "rvvlm_fp64m2.h"
#define RVVLM_TANDI_MERGED rvvlm_tanI

#define RVVLM_TANPID_VSET_CONFIG "rvvlm_fp64m2.h"
#define RVVLM_TANPID_MERGED rvvlm_tanpi

#define RVVLM_TANPIDI_VSET_CONFIG "rvvlm_fp64m2.h"
#define RVVLM_TANPIDI_MERGED rvvlm_tanpiI

// Define the various tables for table-driven implementations
extern int64_t expD_tbl64_fixedpt[64];
extern int64_t logD_tbl128_fixedpt[128];
Expand Down Expand Up @@ -334,6 +358,14 @@ void RVVLM_SINPID_MERGED(size_t x_len, const double *x, double *y);
void RVVLM_SINPIDI_MERGED(size_t x_len, const double *x, size_t stride_x,
double *y, size_t stride_y);

void RVVLM_TAND_MERGED(size_t x_len, const double *x, double *y);
void RVVLM_TANDI_MERGED(size_t x_len, const double *x, size_t stride_x,
double *y, size_t stride_y);

void RVVLM_TANPID_MERGED(size_t x_len, const double *x, double *y);
void RVVLM_TANPIDI_MERGED(size_t x_len, const double *x, size_t stride_x,
double *y, size_t stride_y);

#ifdef __cplusplus
}
#endif
261 changes: 261 additions & 0 deletions include/rvvlm_tanD.inc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "rvvlm_trigD.h"

#include <fenv.h>

#if defined(COMPILE_FOR_TAN)
#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_TAND_MERGED
#else
#define F_VER1 RVVLM_TANDI_MERGED
#endif
#elif defined(COMPILE_FOR_TANPI)
#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_TANPID_MERGED
#else
#define F_VER1 RVVLM_TANPIDI_MERGED
#endif
#else
static_assert(false, "Must specify tan or tanpi" __FILE__);
#endif

#include <fenv.h>

// This versions reduces argument to [-pi/4, pi/4] and computes sin(r) or cos(r)
// tan(x) is either sin(r)/cos(r) or -cos(r)/sin(r)
void F_VER1(API) {
size_t vlen;
VFLOAT vx_orig, vx, vy, vy_special;
VBOOL special_args;
VUINT expo_x;

SET_ROUNDTONEAREST;
// stripmining over input arguments
for (; _inarg_n > 0; _inarg_n -= vlen) {
vlen = VSET(_inarg_n);
vx_orig = VFLOAD_INARG1(vlen);
vx = __riscv_vfsgnj(vx_orig, fp_posOne, vlen);
expo_x = __riscv_vsrl(F_AS_U(vx), MAN_LEN, vlen);

// Set results for input of NaN and Inf and also for |x| very small
EXCEPTION_HANDLING_TRIG(vx_orig, expo_x, special_args, vy_special, vlen);

#if defined(COMPILE_FOR_TAN)
VBOOL x_large =
__riscv_vmsgeu(expo_x, EXP_BIAS + 24, vlen); // |x| >= 2^(24)
VFLOAT vx_copy = vx;
vx = __riscv_vfmerge(vx, fp_posZero, x_large, vlen);

VFLOAT n_flt = __riscv_vfmul(vx, PIBY2_INV, vlen);
VINT n = __riscv_vfcvt_x(n_flt, vlen);
n_flt = __riscv_vfcvt_f(n, vlen);
VFLOAT r_hi = __riscv_vfnmsac(vx, PIBY2_HI, n_flt, vlen);
VUINT expo_r = __riscv_vsrl(F_AS_U(r_hi), MAN_LEN, vlen);
expo_r = __riscv_vand(expo_r, 0x7FF, vlen);
VBOOL r_small =
__riscv_vmsleu(expo_r, EXP_BIAS - 16, vlen); // |r_hi| < 2^(-15)
UINT nb_r_small = __riscv_vcpop(r_small, vlen);
VFLOAT r = __riscv_vfnmsac(r_hi, PIBY2_MID, n_flt, vlen);
VFLOAT r_delta = __riscv_vfsub(r_hi, r, vlen);
r_delta = __riscv_vfnmsac(r_delta, PIBY2_MID, n_flt, vlen);
// At this point, r + r_delta is an accurate reduced argument PROVIDED
// |r_hi| >= 2^(-15)
if (nb_r_small > 0) {
VFLOAT A = __riscv_vfmul(n_flt, PIBY2_MID, vlen);
VFLOAT a = __riscv_vfmsub(n_flt, PIBY2_MID, A, vlen);
// A + a is n * piby2_mid exactly
VFLOAT S = __riscv_vfsub(r_hi, A, vlen);
VFLOAT s = __riscv_vfsub(r_hi, S, vlen);
s = __riscv_vfsub(s, A, vlen);
s = __riscv_vfnmsac(s, PIBY2_LO, n_flt, vlen);
r = __riscv_vmerge(r, S, r_small, vlen);
r_delta = __riscv_vmerge(r_delta, s, r_small, vlen);
}

if (__riscv_vcpop(x_large, vlen) > 0) {
VFLOAT r_xlarge, r_delta_xlarge;
VINT n_xlarge;
LARGE_ARGUMENT_REDUCTION_Piby2(vx_copy, vlen, x_large, n_xlarge, r_xlarge,
r_delta_xlarge);
r = __riscv_vmerge(r, r_xlarge, x_large, vlen);
r_delta = __riscv_vmerge(r_delta, r_delta_xlarge, x_large, vlen);
n = __riscv_vmerge(n, n_xlarge, x_large, vlen);
}
#else
VBOOL x_large =
__riscv_vmsgeu(expo_x, EXP_BIAS + 53, vlen); // |x| >= 2^(53)
vx = __riscv_vfmerge(vx, fp_posZero, x_large, vlen);

// Usual argument reduction
// N = rint(2x); rem := 2x - N, |rem| <= 1/2 and x = (N/2) + (rem/2);
// x pi = N (pi/2) + rem * (pi/2)
VFLOAT two_x = __riscv_vfadd(vx, vx, vlen);
VINT n = __riscv_vfcvt_x(two_x, vlen);
VFLOAT n_flt = __riscv_vfcvt_f(n, vlen);
VFLOAT rem = __riscv_vfsub(two_x, n_flt, vlen);
VBOOL x_is_n_piby2 = __riscv_vmseq(F_AS_U(rem), 0, vlen);
// Now rem * pi_by_2 as r + r_delta
// tanpi can be exactly 0 or Inf when x_is_n_piby2
// Furthermore, the signs of these 0 and Inf are as follows.
// tanpi(-X) = -tanpi(X). Thus only consider X >= 0.
// tanpi(n * pi/2) = (-1)^floor(n/2) {0 if n even; Inf otherwise}
if (__riscv_vcpop(x_is_n_piby2, vlen) > 0) {
VBOOL n_even = __riscv_vmseq(__riscv_vand(n, 0x1, vlen), 0, vlen);
VBOOL set_inf = __riscv_vmandn(x_is_n_piby2, n_even, vlen);
VFLOAT Zero_or_Inf = VFMV_VF(fp_posZero, vlen);
Zero_or_Inf = __riscv_vmerge(Zero_or_Inf,
__riscv_vfrec7(set_inf, Zero_or_Inf, vlen),
set_inf, vlen);
Zero_or_Inf = __riscv_vfsgnj(
Zero_or_Inf, U_AS_F(__riscv_vsll(I_AS_U(n), BIT_WIDTH - 2, vlen)),
vlen);
Zero_or_Inf = __riscv_vfsgnjx(Zero_or_Inf, vx_orig, vlen);
vy_special = __riscv_vmerge(vy_special, Zero_or_Inf, x_is_n_piby2, vlen);
special_args = __riscv_vmor(special_args, x_is_n_piby2, vlen);
n = __riscv_vmerge(n, 0, x_is_n_piby2, vlen);
}
VFLOAT r = __riscv_vfmul(rem, PIBY2_HI, vlen);
VFLOAT r_delta = __riscv_vfmsac(r, PIBY2_HI, rem, vlen);
r_delta = __riscv_vfmacc(r_delta, PIBY2_MID, rem, vlen);
// At this point, r + r_delta is an accurate reduced argument PROVIDED
#endif

VUINT n_lsb = __riscv_vand(I_AS_U(n), 0x1, vlen);
VBOOL numer_pick_c = __riscv_vmsne(n_lsb, 0, vlen);
VBOOL denom_pick_c = __riscv_vmnot(numer_pick_c, vlen);

//
// sin(r) is approximated by 8 terms corresponding to x, x^3, ..., x^15
// cos(r) is approximated by 8 terms corresponding to 1, x^2, ..., x^14
// This "r" is more precise than FP64; it suffices to use the FP64-precise
// value for the last 6 terms for sin and cos. We only need to use the
// extra precise values for the first two terms for each of the above.
// Our strategy here is to use extra precision simulation with
// floating-point computation
//
// For sin(r), the first 2 terms are r + p r^3 where p is basically -1/6
// We decompose r into r = r_head + t_tail where r_head is r with the lower
// 36 bits set to 0. This way, r_head^3 can be computed exactly. r + p r^3 =
// r + r_head^3 * p + (r^3 - r_head^3) * p r + r_head^3 * p can be computed
// by sin_hi := r + r_head^3 * p (FMA) sin_corr := (r - sin_hi) + r_head^3
// * p (subtract and FMA) sin_hi + sin_corr is is r + r_head^3 * p to
// doubled FP64 precision (way more than needed) Next we need to add (r^3 -
// r_head^3) * p which is r_tail * (r^2 + r * r_head + r_head^2) * p because
// r_tail is small, rounding error in computing this is immaterial to the
// final result Finally, we need also to add r_delta * (1 - r^2/2) to
// sin_corr because sin(r + r_delta) ~=~ sin(r) + r_delta * cos(r) ~=~
// sin(r) + r_delta * (1 - r^2/2). Note that the term 1 - r^2/2 will be
// computed in the course of our computation of cos(r), discussed next.
//
// For cos(r), the first 2 terms are 1 - r^2/2. This can be easily computed
// to high precision. r_prime := r * 1/2; cos_hi := 1 - r * r_prime (FMA);
// cos_corr := (1 - cos_hi) - r * r_prime cos_hi can be used above to
// compute r_delta * (1 - r^2/2). Because cos(r + r_delta) ~=~ cos(r) -
// r_delta * sin(r) ~=~ cos(r) - r_delta * r we add the term -r_delta * r to
// cos_corr
//
// So in a nutshell sin(r) is approximated by sin_hi + sin_lo,
// sin_lo is the sum of sin_corr and a polynomial starting at r^5
//
// And cos(r) is approximated by cos_hi + cos_lo,
// cos_lo is the sum of cos_corr and a polynomial starting at r^4
//
// By suitably merging the two, we have numer_hi, numer_lo and denom_hi,
// denom_lo
//

VFLOAT rsq = __riscv_vfmul(r, r, vlen);

UINT mask_r_head = 1;
mask_r_head = ~((mask_r_head << 36) - 1);
VFLOAT r_head = U_AS_F(__riscv_vand(F_AS_U(r), mask_r_head, vlen));
VFLOAT r_tail = __riscv_vfsub(r, r_head, vlen);

UINT exp_m1 = 1;
exp_m1 = (exp_m1 << 52);
VFLOAT r_prime = U_AS_F(__riscv_vsub(F_AS_U(r), exp_m1, vlen));
// |r| is never too small, so subtracting 1 from exponent is division by 2

VFLOAT ONE = VFMV_VF(fp_posOne, vlen);
VFLOAT cos_hi = __riscv_vfnmsac(ONE, r, r_prime, vlen);
VFLOAT cos_corr = __riscv_vfsub(ONE, cos_hi, vlen);
cos_corr = __riscv_vfnmsac(cos_corr, r, r_prime, vlen);
cos_corr = __riscv_vfnmsac(cos_corr, r_delta, r, vlen);

double coeff = -0x1.5555555555555p-3;
VFLOAT r_head_cube = __riscv_vfmul(r_head, r_head, vlen);
r_head_cube = __riscv_vfmul(r_head_cube, r_head, vlen);
VFLOAT sin_hi = __riscv_vfmadd(r_head_cube, coeff, r, vlen);
VFLOAT sin_corr = __riscv_vfsub(r, sin_hi, vlen);
sin_corr = __riscv_vfmacc(sin_corr, coeff, r_head_cube, vlen);
VFLOAT tmp = __riscv_vfmadd(r_head, r_head, rsq, vlen);
VFLOAT tmp2 = __riscv_vfmul(r_tail, coeff, vlen);
tmp = __riscv_vfmacc(tmp, r_head, r, vlen);
sin_corr = __riscv_vfmacc(sin_corr, tmp, tmp2, vlen);
sin_corr = __riscv_vfmacc(sin_corr, r_delta, cos_hi, vlen);

VFLOAT poly_s =
PSTEP(0x1.1111111111069p-7, rsq,
PSTEP(-0x1.a01a019ffe527p-13, rsq,
PSTEP(0x1.71de3a33a62c6p-19, rsq,
PSTEP(-0x1.ae642c52fc493p-26, rsq,
PSTEP(0x1.6109be886e15cp-33,
-0x1.9ffe1dd295e78p-41, rsq, vlen),
vlen),
vlen),
vlen),
vlen);

VFLOAT poly_c =
PSTEP(0x1.5555555555546p-5, rsq,
PSTEP(-0x1.6c16c16c1450cp-10, rsq,
PSTEP(0x1.a01a019b77545p-16, rsq,
PSTEP(-0x1.27e4f72551e3dp-22, rsq,
PSTEP(0x1.1ee950032f74cp-29,
-0x1.8f5dd75850673p-37, rsq, vlen),
vlen),
vlen),
vlen),
vlen);

VFLOAT r_to_4 = __riscv_vfmul(rsq, rsq, vlen);
VFLOAT r_to_5 = __riscv_vfmul(r_to_4, r, vlen);

poly_c = __riscv_vfmadd(poly_c, r_to_4, cos_corr, vlen);
poly_s = __riscv_vfmadd(poly_s, r_to_5, sin_corr, vlen);

VFLOAT S, s, C, c;
FAST2SUM(sin_hi, poly_s, S, s, vlen);
FAST2SUM(cos_hi, poly_c, C, c, vlen);

VFLOAT numer_hi, numer_lo, denom_hi, denom_lo;
numer_hi = S;
numer_hi = __riscv_vmerge(numer_hi, C, numer_pick_c, vlen);
numer_lo = s;
numer_lo = __riscv_vmerge(numer_lo, c, numer_pick_c, vlen);

denom_hi = S;
denom_hi = __riscv_vmerge(denom_hi, C, denom_pick_c, vlen);
denom_lo = s;
denom_lo = __riscv_vmerge(denom_lo, c, denom_pick_c, vlen);

DIV_N2D2(numer_hi, numer_lo, denom_hi, denom_lo, vy, vlen);

// need to put the correct sign
n = __riscv_vsll(n, BIT_WIDTH - 1, vlen);
vy = __riscv_vfsgnjx(vy, I_AS_F(n), vlen);
vy = __riscv_vfsgnjx(vy, vx_orig, vlen);

vy = __riscv_vmerge(vy, vy_special, special_args, vlen);

// copy vy into y and increment addr pointers
VFSTORE_OUTARG1(vy, vlen);

INCREMENT_INARG1(vlen);
INCREMENT_OUTARG1(vlen);
}
RESTORE_FRM;
}
4 changes: 2 additions & 2 deletions include/rvvlm_trigD.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
#define PI_HI 0x1.921fb54442d18p+1
#define PI_MID 0x1.1a62633145c07p-53

#if defined(COMPILE_FOR_SIN)
#if defined(COMPILE_FOR_SIN) || defined(COMPILE_FOR_TAN)
#define FUNC_NEAR_ZERO(small_x, vx, vlen) \
__riscv_vfmadd((small_x), (vx), 0x1.0p-60, (vx), (vlen))
#elif defined(COMPILE_FOR_SINPI)
#elif defined(COMPILE_FOR_SINPI) || defined(COMPILE_FOR_TANPI)
#define FUNC_NEAR_ZERO(small_x, vx, vlen) \
__riscv_vfmadd((small_x), (vx), PI_HI, \
__riscv_vfmul((small_x), (vx), PI_MID, (vlen)), (vlen))
Expand Down
17 changes: 17 additions & 0 deletions src/rvvlm_tanD.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <riscv_vector.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE UNIT_STRIDE

#include RVVLM_TAND_VSET_CONFIG

#define COMPILE_FOR_TAN
#include "rvvlm_trigD.h"

#include "rvvlm_tanD.inc.h"
17 changes: 17 additions & 0 deletions src/rvvlm_tanDI.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <riscv_vector.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE GENERAL_STRIDE

#include RVVLM_TANDI_VSET_CONFIG

#define COMPILE_FOR_TAN
#include "rvvlm_trigD.h"

#include "rvvlm_tanD.inc.h"
17 changes: 17 additions & 0 deletions src/rvvlm_tanpiD.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <riscv_vector.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE UNIT_STRIDE

#include RVVLM_TANPID_VSET_CONFIG

#define COMPILE_FOR_TANPI
#include "rvvlm_trigD.h"

#include "rvvlm_tanD.inc.h"
Loading

0 comments on commit 984880a

Please sign in to comment.