From ca948ac683950bbe53bf5194c92e26c553692340 Mon Sep 17 00:00:00 2001
From: PingTakPeterTang <ptpt@rivosinc.com>
Date: Thu, 28 Mar 2024 17:37:17 -0700
Subject: [PATCH] add the FP64 true gamma function tgamma

---
 CMakeLists.txt               |   2 +
 include/rvvlm.h              |  23 ++
 include/rvvlm_gammafuncsD.h  |  27 +++
 include/rvvlm_tgammaD.inc.h  | 446 +++++++++++++++++++++++++++++++++++
 src/rvvlm_tgammaD.c          |  16 ++
 src/rvvlm_tgammaDI.c         |  16 ++
 test/CMakeLists.txt          |   2 +
 test/include/test_infra.h    |   5 +
 test/src/test_acos.cpp       |   8 +-
 test/src/test_acospi.cpp     |   8 +-
 test/src/test_asinh.cpp      |  16 +-
 test/src/test_atan2.cpp      |  24 +-
 test/src/test_atan2pi.cpp    |  48 ++--
 test/src/test_atanh.cpp      |  16 +-
 test/src/test_cdfnorm.cpp    |  12 +-
 test/src/test_cdfnorminv.cpp |  16 +-
 test/src/test_erf.cpp        |  10 +-
 test/src/test_erfcinv.cpp    |  16 +-
 test/src/test_erfinv.cpp     |  28 +--
 test/src/test_exp10.cpp      |  12 +-
 test/src/test_exp2.cpp       |   8 +-
 test/src/test_expm1.cpp      |  14 +-
 test/src/test_infra.cpp      |  37 +++
 test/src/test_log.cpp        |  10 +-
 test/src/test_log2.cpp       |  10 +-
 test/src/test_pow.cpp        |  12 +-
 test/src/test_tan.cpp        |  10 +-
 test/src/test_tgamma.cpp     |  83 +++++++
 test/src/test_tgammaI.cpp    |  24 ++
 29 files changed, 816 insertions(+), 143 deletions(-)
 create mode 100644 include/rvvlm_gammafuncsD.h
 create mode 100644 include/rvvlm_tgammaD.inc.h
 create mode 100644 src/rvvlm_tgammaD.c
 create mode 100644 src/rvvlm_tgammaDI.c
 create mode 100644 test/src/test_tgamma.cpp
 create mode 100644 test/src/test_tgammaI.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b29a7f..a71de7e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,6 +98,8 @@ set(PROJECT_SOURCES
     src/rvvlm_sinhDI.c
     src/rvvlm_tanhD.c
     src/rvvlm_tanhDI.c
+    src/rvvlm_tgammaD.c
+    src/rvvlm_tgammaDI.c
 )
 
 add_library(vecm
diff --git a/include/rvvlm.h b/include/rvvlm.h
index 7e00870..3eb6ed8 100644
--- a/include/rvvlm.h
+++ b/include/rvvlm.h
@@ -172,6 +172,17 @@ union sui64_fp64 {
     (delta_Q) = __riscv_vfmul(_q, __riscv_vfrec7((denom), (vlen)), (vlen));    \
   } while (0)
 
+#define ACC_DIV2_N2D2(numer, delta_n, denom, delta_d, Q, delta_Q, vlen)        \
+  do {                                                                         \
+    VFLOAT _recip, _q;                                                         \
+    _recip = __riscv_vfrdiv((denom), 0x1.0p0, (vlen));                         \
+    (Q) = __riscv_vfmul((numer), _recip, (vlen));                              \
+    _q = __riscv_vfnmsub((Q), (denom), (numer), (vlen));                       \
+    _q = __riscv_vfnmsac(_q, (Q), (delta_d), (vlen));                          \
+    _q = __riscv_vfadd(_q, (delta_n), (vlen));                                 \
+    (delta_Q) = __riscv_vfmul(_q, _recip, (vlen));                             \
+  } while (0)
+
 #define SQRT2_X2(x, delta_x, r, delta_r, vlen)                                 \
   do {                                                                         \
     VFLOAT xx = __riscv_vfadd((x), (delta_x), (vlen));                         \
@@ -469,6 +480,13 @@ union sui64_fp64 {
 #define RVVLM_TANPIDI_VSET_CONFIG "rvvlm_fp64m2.h"
 #define RVVLM_TANPIDI_MERGED rvvlm_tanpiI
 
+// FP64 tgamma function configuration
+#define RVVLM_TGAMMAD_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_TGAMMAD_STD rvvlm_tgamma
+
+#define RVVLM_TGAMMADI_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_TGAMMADI_STD rvvlm_tgammaI
+
 // FP64 cosh function configuration
 #define RVVLM_COSHD_VSET_CONFIG "rvvlm_fp64m2.h"
 #define RVVLM_COSHD_STD rvvlm_coshD_std
@@ -499,6 +517,7 @@ extern int64_t expD_tbl64_fixedpt[64];
 extern int64_t logD_tbl128_fixedpt[128];
 extern double logtbl_4_powD_128_hi_lo[256];
 extern double dbl_2ovpi_tbl[28];
+extern int64_t factorial_fixedpt[180];
 
 // Define the functions in the vector math library
 void RVVLM_ACOSD_FIXEDPT(size_t x_len, const double *x, double *y);
@@ -703,6 +722,10 @@ void RVVLM_TANHD_STD(size_t x_len, const double *x, double *y);
 void RVVLM_TANHDI_STD(size_t x_len, const double *x, size_t stride_x, double *y,
                       size_t stride_y);
 
+void RVVLM_TGAMMAD_STD(size_t x_len, const double *x, double *y);
+void RVVLM_TGAMMADI_STD(size_t x_len, const double *x, size_t stride_x,
+                        double *y, size_t stride_y);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/rvvlm_gammafuncsD.h b/include/rvvlm_gammafuncsD.h
new file mode 100644
index 0000000..80ccbcf
--- /dev/null
+++ b/include/rvvlm_gammafuncsD.h
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// gamma(+inf) = +inf; gamma(-inf/sNaN) is qNaN with invalid
+// gamma(qNaN) is qNaN
+// gamma(+-0) is +-inf and divide by 0
+// gamma(tiny) is 1/tiny
+#define EXCEPTION_HANDLING_TGAMMA(vx, special_args, vy_special, vlen)          \
+  do {                                                                         \
+    VUINT expo_x = __riscv_vand(__riscv_vsrl(F_AS_U((vx)), MAN_LEN, (vlen)),   \
+                                0x7FF, (vlen));                                \
+    VBOOL x_small = __riscv_vmsltu(expo_x, EXP_BIAS - 60, (vlen));             \
+    VBOOL x_InfNaN = __riscv_vmseq(expo_x, 0x7FF, (vlen));                     \
+    (special_args) = __riscv_vmor(x_small, x_InfNaN, (vlen));                  \
+    if (__riscv_vcpop((special_args), (vlen)) > 0) {                           \
+      VUINT vclass = __riscv_vfclass((vx), (vlen));                            \
+      VBOOL x_negInf;                                                          \
+      IDENTIFY(vclass, class_negInf, x_negInf, (vlen));                        \
+      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_negInf, (vlen));                 \
+      VFLOAT y_tmp = __riscv_vfadd(x_InfNaN, (vx), (vx), (vlen));              \
+      (vy_special) = __riscv_vmerge((vy_special), y_tmp, x_InfNaN, (vlen));    \
+      y_tmp = __riscv_vfrdiv(x_small, (vx), fp_posOne, (vlen));                \
+      (vy_special) = __riscv_vmerge((vy_special), y_tmp, x_small, (vlen));     \
+      (vx) = __riscv_vfmerge((vx), fp_posOne, (special_args), (vlen));         \
+    }                                                                          \
+  } while (0)
diff --git a/include/rvvlm_tgammaD.inc.h b/include/rvvlm_tgammaD.inc.h
new file mode 100644
index 0000000..acecb82
--- /dev/null
+++ b/include/rvvlm_tgammaD.inc.h
@@ -0,0 +1,446 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#if (STRIDE == UNIT_STRIDE)
+#define F_VER1 RVVLM_TGAMMAD_STD
+#else
+#define F_VER1 RVVLM_TGAMMADI_STD
+#endif
+
+//---Approximate log(x) by w + w^3 poly(w^2)
+//   w = 2(x-1)/(x+1), x roughly in [1/rt(2), rt(2)]
+#define P_log_0 0x5555555555555090 // Q_66
+#define P_log_1 0x666666666686863a // Q_69
+#define P_log_2 0x49249248fc99ba4b // Q_71
+#define P_log_3 0x71c71ca402e164fa // Q_74
+#define P_log_4 0x5d1733e3ae94dde0 // Q_76
+#define P_log_5 0x4ec8b69784234032 // Q_78
+#define P_log_6 0x43cc44a056dc3c93 // Q_80
+#define P_log_7 0x4432439bb76e7d74 // Q_82
+
+#define LOG2_HI 0x1.62e42fefa4000p-1
+#define LOG2_LO -0x1.8432a1b0e2634p-43
+
+//---Approximate exp(R) by 1 + R + R^2*poly(R)
+#define P_exp_0 0x400000000000004e // Q_63
+#define P_exp_1 0x1555555555555b6e // Q_63
+#define P_exp_2 0x555555555553378  // Q_63
+#define P_exp_3 0x1111111110ec10d  // Q_63
+#define P_exp_4 0x2d82d82d87a9b5   // Q_63
+#define P_exp_5 0x6806806ce6d6f    // Q_63
+#define P_exp_6 0xd00d00841fcf     // Q_63
+#define P_exp_7 0x171ddefda54b     // Q_63
+#define P_exp_8 0x24fcc01d627      // Q_63
+#define P_exp_9 0x35ed8bbd24       // Q_63
+#define P_exp_10 0x477745b6c       // Q_63
+
+//---Approximate Stirling correction by P(t)/Q(t)
+//  Gamma(x) = (x/e)^(x-1/2) * P(t)/Q(t), t = 1/x, x in [2, 180]
+#define P_corr_0 0x599ecf7a9368327  // Q_78
+#define P_corr_1 0x120a4be8e3d8673d // Q_78
+#define P_corr_2 0x2ab73aec63e90213 // Q_78
+#define P_corr_3 0x32f903e18454e088 // Q_78
+#define P_corr_4 0x29f463d533d0a4b5 // Q_78
+#define P_corr_5 0x1212989fdf61f6c1 // Q_78
+#define P_corr_6 0x48706d4f75a0491  // Q_78
+#define P_corr_7 0x5591439d2d51a6   // Q_78
+
+#define Q_corr_0 0x75e5053ce715a76  // Q_79
+#define Q_corr_1 0x171e2068d3ef7453 // Q_79
+#define Q_corr_2 0x363d736690f2373f // Q_79
+#define Q_corr_3 0x3e793a1cc19bbc32 // Q_79
+#define Q_corr_4 0x31dc2fbf92ec978c // Q_79
+#define Q_corr_5 0x138c2244d1c1e0b1 // Q_79
+#define Q_corr_6 0x450a7392d81c20f  // Q_79
+#define Q_corr_7 0x1ed9c605221435   // Q_79
+
+//---Approximate sin(pi x)/pi as x + x^3 poly(x^2)
+#define P_sin_0 -0x694699894c1f4ae7 // Q_62
+#define P_sin_1 0x33f396805788034f  // Q_62
+#define P_sin_2 -0xc3547239048c220  // Q_62
+#define P_sin_3 0x1ac6805cc1cecf4   // Q_62
+#define P_sin_4 -0x26702d2fd5a3e6   // Q_62
+#define P_sin_5 0x26e8d360232c6     // Q_62
+#define P_sin_6 -0x1d3e4d9787ba     // Q_62
+#define P_sin_7 0x107298fc107       // Q_62
+
+//---Compute log(x/e) to 2^(-65) absolute accuracy
+//   for Stirlings formula (x/e)^x sqrt(2pi/x) = (x/e)^(x-1/2) sqrt(2pi/e)
+#define TGAMMA_LOG(x_hi, x_lo, y_hi, y_lo, vlen)                               \
+  do {                                                                         \
+    VFLOAT x_in_hi = (x_hi);                                                   \
+    VFLOAT x_in_lo = (x_lo);                                                   \
+    VINT n = __riscv_vadd(__riscv_vsra(F_AS_I(x_in_hi), MAN_LEN - 8, (vlen)),  \
+                          0x96, vlen);                                         \
+    n = __riscv_vsub(__riscv_vsra(n, 8, vlen), EXP_BIAS, vlen);                \
+    VFLOAT scale = I_AS_F(                                                     \
+        __riscv_vsll(__riscv_vrsub(n, EXP_BIAS, (vlen)), MAN_LEN, (vlen)));    \
+    x_in_hi = __riscv_vfmul(x_in_hi, scale, (vlen));                           \
+    x_in_lo = __riscv_vfmul(x_in_lo, scale, (vlen));                           \
+    /* x is scaled, and log(x) is 2 atanh(w/2); w = 2(x-1)/(x+1) */            \
+                                                                               \
+    VFLOAT numer, denom, denom_delta;                                          \
+    numer = __riscv_vfsub(x_in_hi, fp_posOne, (vlen)); /* exact */             \
+    denom = __riscv_vfadd(x_in_hi, fp_posOne, (vlen));                         \
+    denom_delta = __riscv_vfadd(__riscv_vfrsub(denom, fp_posOne, (vlen)),      \
+                                x_in_hi, (vlen));                              \
+    denom_delta = __riscv_vfadd(denom_delta, x_in_lo, (vlen));                 \
+    VFLOAT w_hi, w_lo;                                                         \
+    ACC_DIV2_N2D2(numer, x_in_lo, denom, denom_delta, w_hi, w_lo, vlen);       \
+    /* w_hi + w_lo is at this point (x-1)/(x+1) */                             \
+    /* Next get 2(x-1)/(x+1) in Q64 fixed point */                             \
+    VINT W = __riscv_vfcvt_x(__riscv_vfmul(w_hi, 0x1.0p65, (vlen)), (vlen));   \
+    W = __riscv_vadd(                                                          \
+        W, __riscv_vfcvt_x(__riscv_vfmul(w_lo, 0x1.0p65, (vlen)), (vlen)),     \
+        (vlen));                                                               \
+    /* W is in Q64 because W is 2(x-1)/(x+1) */                                \
+                                                                               \
+    VFLOAT n_flt = __riscv_vfcvt_f(n, (vlen));                                 \
+    VINT W2 = __riscv_vsmul(W, W, 1, (vlen)); /* Q65 */                        \
+                                                                               \
+    VINT P_right, P_left, W8;                                                  \
+    P_right = PSTEP_I_SRA(P_log_6, P_log_7, 4, W2, (vlen));                    \
+    P_right = PSTEP_I_SRA(P_log_5, W2, 4, P_right, (vlen));                    \
+    P_right = PSTEP_I_SRA(P_log_4, W2, 4, P_right, (vlen));                    \
+    /* P_right in Q76 */                                                       \
+    P_left = PSTEP_I_SRA(P_log_2, P_log_3, 5, W2, (vlen));                     \
+    P_left = PSTEP_I_SRA(P_log_1, W2, 4, P_left, (vlen));                      \
+    P_left = PSTEP_I_SRA(P_log_0, W2, 5, P_left, (vlen));                      \
+    /* P_left in Q66 */                                                        \
+    W8 = __riscv_vsmul(W2, W2, 1, (vlen));           /* Q67 */                 \
+    W8 = __riscv_vsmul(W8, W8, 1, (vlen));           /* Q71 */                 \
+    P_right = __riscv_vsmul(P_right, W8, 1, (vlen)); /* Q84 */                 \
+    P_right = __riscv_vsra(P_right, 18, (vlen));     /* Q66 */                 \
+    P_left = __riscv_vadd(P_left, P_right, (vlen));  /* Q66 */                 \
+                                                                               \
+    VINT W3 = __riscv_vsmul(W2, W, 1, (vlen));     /* Q66 */                   \
+    P_left = __riscv_vsmul(P_left, W3, 1, (vlen)); /* Q69 */                   \
+    VFLOAT poly_hi = __riscv_vfcvt_f(P_left, (vlen));                          \
+    P_left = __riscv_vsub(P_left, __riscv_vfcvt_x(poly_hi, (vlen)), (vlen));   \
+    VFLOAT poly_lo = __riscv_vfcvt_f(P_left, (vlen));                          \
+    poly_hi = __riscv_vfmul(poly_hi, 0x1.0p-69, (vlen));                       \
+    poly_lo = __riscv_vfmul(poly_lo, 0x1.0p-69, (vlen));                       \
+                                                                               \
+    /* n*log(2) - 1 + w + poly is the desired result */                        \
+    VFLOAT A, B;                                                               \
+    A = __riscv_vfmul(n_flt, LOG2_HI, (vlen)); /* exact */                     \
+    A = __riscv_vfsub(A, fp_posOne, (vlen));   /* exact due to A's range */    \
+    w_hi = __riscv_vfadd(w_hi, w_hi, (vlen));                                  \
+    w_lo = __riscv_vfadd(w_lo, w_lo, (vlen));                                  \
+    FAST2SUM(A, w_hi, B, (y_lo), (vlen));                                      \
+    w_lo = __riscv_vfadd((y_lo), w_lo, (vlen));                                \
+    w_lo = __riscv_vfmacc(w_lo, LOG2_LO, n_flt, (vlen));                       \
+    poly_lo = __riscv_vfadd(poly_lo, w_lo, (vlen));                            \
+    FAST2SUM(B, poly_hi, (y_hi), (y_lo), (vlen));                              \
+    (y_lo) = __riscv_vfadd((y_lo), poly_lo, (vlen));                           \
+  } while (0)
+
+//---Compute exp for Stirlings formula used in tgamma
+//   computes exp(x_hi + x_lo) as 2^n * EXP, EXP is fixed-point Q62
+#define TGAMMA_EXP(x_hi, x_lo, n, EXP, vlen)                                   \
+  do {                                                                         \
+    VFLOAT n_flt = __riscv_vfmul((x_hi), 0x1.71547652b82fep+0, (vlen));        \
+    (n) = __riscv_vfcvt_x(n_flt, (vlen));                                      \
+    n_flt = __riscv_vfcvt_f((n), (vlen));                                      \
+    VFLOAT r_hi = __riscv_vfnmsub(n_flt, LOG2_HI, (x_hi), (vlen));             \
+    VFLOAT r_lo = __riscv_vfnmsub(n_flt, LOG2_LO, (x_lo), (vlen));             \
+    r_hi = __riscv_vfmul(r_hi, 0x1.0p63, (vlen));                              \
+    r_lo = __riscv_vfmul(r_lo, 0x1.0p63, (vlen));                              \
+    VINT R = __riscv_vfcvt_x(r_hi, (vlen));                                    \
+    R = __riscv_vadd(R, __riscv_vfcvt_x(r_lo, (vlen)), (vlen));                \
+    /* R is reduced argument in Q63 */                                         \
+                                                                               \
+    VINT P_right =                                                             \
+        PSTEP_I(P_exp_5, R,                                                    \
+                PSTEP_I(P_exp_6, R,                                            \
+                        PSTEP_I(P_exp_7, R,                                    \
+                                PSTEP_I(P_exp_8, R,                            \
+                                        PSTEP_I(P_exp_9, P_exp_10, R, (vlen)), \
+                                        (vlen)),                               \
+                                (vlen)),                                       \
+                        (vlen)),                                               \
+                (vlen));                                                       \
+    VINT R_sq = __riscv_vsmul(R, R, 1, (vlen));                                \
+    VINT R_to_5 = __riscv_vsmul(R_sq, R_sq, 1, (vlen));                        \
+    R_to_5 = __riscv_vsmul(R_sq, R_sq, 1, (vlen));                             \
+    R_to_5 = __riscv_vsmul(R_to_5, R, 1, (vlen));                              \
+    VINT P_left =                                                              \
+        PSTEP_I(P_exp_0, R,                                                    \
+                PSTEP_I(P_exp_1, R,                                            \
+                        PSTEP_I(P_exp_2, R,                                    \
+                                PSTEP_I(P_exp_3, P_exp_4, R, (vlen)), (vlen)), \
+                        (vlen)),                                               \
+                (vlen));                                                       \
+    P_right = __riscv_vsmul(P_right, R_to_5, 1, (vlen));                       \
+    P_left = __riscv_vadd(P_right, P_left, (vlen));                            \
+    P_left = __riscv_vsmul(P_left, R_sq, 1, (vlen));                           \
+    P_left = __riscv_vadd(P_left, R, (vlen));                                  \
+    (EXP) = __riscv_vsra(P_left, 1, (vlen));                                   \
+    INT ONE = (1LL) << 62;                                                     \
+    (EXP) = __riscv_vadd((EXP), ONE, (vlen));                                  \
+  } while (0)
+
+// Compute the term (x/e)^(x-1/2) for 2 <= x <= 180.
+// Return integer n and Q62 fixed point EXP, 2^n value_of(EXP) is (x/e)^(x-1/2)
+#define STIRLING_POWER(x_hi, x_lo, n, EXP, vlen)                               \
+  do {                                                                         \
+    VFLOAT y_hi, y_lo;                                                         \
+    TGAMMA_LOG((x_hi), (x_lo), y_hi, y_lo, (vlen));                            \
+    VFLOAT x_m_half = __riscv_vfsub((x_hi), 0x1.0p-1, (vlen));                 \
+    /* compute (x_m_half, x_lo) * (y_hi, y_lo) */                              \
+    VFLOAT z_hi, z_lo;                                                         \
+    PROD_X1Y1(x_m_half, y_hi, z_hi, z_lo, (vlen));                             \
+    z_lo = __riscv_vfmacc(z_lo, x_m_half, y_lo, (vlen));                       \
+    z_lo = __riscv_vfmacc(z_lo, (x_lo), y_hi, (vlen));                         \
+    TGAMMA_EXP(z_hi, z_lo, (n), (EXP), (vlen));                                \
+  } while (0)
+
+// Gamma based on Stirling formula is Gamma(x) ~ (x/e)^x sqrt(2 pi / x)
+// poly(1/x) To incoporate the 1/sqrt(x) into the power calculation
+// (x/e)^(x-1/2) sqrt(2 pi / e ) poly(1/x)
+// This poly(1/x) is in essense a correction term
+#define STIRLING_CORRECTION(x_hi, x_lo, P_SC, Q_SC, vlen)                      \
+  do {                                                                         \
+    /* 2 <= x < 180. Use Q62 to represent 1/x in fixed point */                \
+    VFLOAT y_hi = __riscv_vfrdiv((x_hi), fp_posOne, (vlen));                   \
+    VFLOAT y_lo = VFMV_VF(fp_posOne, (vlen));                                  \
+    y_lo = __riscv_vfnmsub((x_hi), y_hi, y_lo, (vlen));                        \
+    y_lo = __riscv_vfnmsac(y_lo, (x_lo), y_hi, (vlen));                        \
+    y_lo = __riscv_vfmul(y_hi, y_lo, (vlen));                                  \
+    y_hi = __riscv_vfmul(y_hi, 0x1.0p62, (vlen));                              \
+    y_lo = __riscv_vfmul(y_lo, 0x1.0p62, (vlen));                              \
+    VINT R = __riscv_vfcvt_x(y_hi, (vlen));                                    \
+    R = __riscv_vadd(R, __riscv_vfcvt_x(y_lo, (vlen)), (vlen));                \
+    /* R is 1/(x_hi+x_lo) in Q62 */                                            \
+    (P_SC) = PSTEP_I_SLL(P_corr_6, P_corr_7, 1, R, (vlen));                    \
+    (P_SC) = PSTEP_I_SLL(P_corr_5, R, 1, (P_SC), (vlen));                      \
+    (P_SC) = PSTEP_I_SLL(P_corr_4, R, 1, (P_SC), (vlen));                      \
+    (P_SC) = PSTEP_I_SLL(P_corr_3, R, 1, (P_SC), (vlen));                      \
+    (P_SC) = PSTEP_I_SLL(P_corr_2, R, 1, (P_SC), (vlen));                      \
+    (P_SC) = PSTEP_I_SLL(P_corr_1, R, 1, (P_SC), (vlen));                      \
+    (P_SC) = PSTEP_I_SLL(P_corr_0, R, 1, (P_SC), (vlen));                      \
+                                                                               \
+    (Q_SC) = PSTEP_I_SLL(Q_corr_6, Q_corr_7, 1, R, (vlen));                    \
+    (Q_SC) = PSTEP_I_SLL(Q_corr_5, R, 1, (Q_SC), (vlen));                      \
+    (Q_SC) = PSTEP_I_SLL(Q_corr_4, R, 1, (Q_SC), (vlen));                      \
+    (Q_SC) = PSTEP_I_SLL(Q_corr_3, R, 1, (Q_SC), (vlen));                      \
+    (Q_SC) = PSTEP_I_SLL(Q_corr_2, R, 1, (Q_SC), (vlen));                      \
+    (Q_SC) = PSTEP_I_SLL(Q_corr_1, R, 1, (Q_SC), (vlen));                      \
+    (Q_SC) = PSTEP_I_SLL(Q_corr_0, R, 1, (Q_SC), (vlen));                      \
+  } while (0)
+
+// When input x to gamma(x) is negative, a factor of  sin(pi x)/pi
+// is needed. When x is an exact negative integer, we need to return
+// +-inf as special values and also raise the divide-by-zero signal
+// The input to TGAMMA_SIN is actually |x| clipped to [2^(-60), 179.5]
+#define TGAMMA_SIN(x, P_SIN, SIN_scale, n, vy_special, special_args, vlen)     \
+  do {                                                                         \
+    VFLOAT n_flt;                                                              \
+    (n) = __riscv_vfcvt_x((x), (vlen));                                        \
+    n_flt = __riscv_vfcvt_f((n), (vlen));                                      \
+    VFLOAT r = __riscv_vfsub((x), n_flt, (vlen));                              \
+    VINT m = __riscv_vsra(F_AS_I(r), MAN_LEN, (vlen));                         \
+    m = __riscv_vrsub(__riscv_vand(m, 0x7FF, (vlen)), EXP_BIAS, (vlen));       \
+    /* r = 2^(-m) * val, val in [1, 2). Note that 1 <= m <= 60 */              \
+    VFLOAT scale = I_AS_F(__riscv_vsll(__riscv_vadd(m, EXP_BIAS + 61, (vlen)), \
+                                       MAN_LEN, (vlen)));                      \
+    VINT R = __riscv_vfcvt_x(__riscv_vfmul(r, scale, (vlen)), (vlen));         \
+    /* R is fixed point in scale 61+m */                                       \
+    VFLOAT rsq = __riscv_vfmul(r, r, (vlen));                                  \
+    VFLOAT rsq_lo = __riscv_vfmsub(r, r, rsq, (vlen));                         \
+    VINT Rsq = __riscv_vfcvt_x(__riscv_vfmul(rsq, 0x1.0p63, (vlen)), (vlen));  \
+    Rsq = __riscv_vadd(                                                        \
+        Rsq, __riscv_vfcvt_x(__riscv_vfmul(rsq_lo, 0x1.0p63, (vlen)), (vlen)), \
+        (vlen));                                                               \
+    VINT P_right = PSTEP_I(                                                    \
+        P_sin_4, Rsq,                                                          \
+        PSTEP_I(P_sin_5, Rsq, PSTEP_I(P_sin_6, P_sin_7, Rsq, (vlen)), (vlen)), \
+        (vlen));                                                               \
+    VINT R8 = __riscv_vsmul(Rsq, Rsq, 1, (vlen));                              \
+    R8 = __riscv_vsmul(R8, R8, 1, (vlen));                                     \
+    VINT P_left = PSTEP_I(                                                     \
+        P_sin_0, Rsq,                                                          \
+        PSTEP_I(P_sin_1, Rsq, PSTEP_I(P_sin_2, P_sin_3, Rsq, (vlen)), (vlen)), \
+        (vlen));                                                               \
+    P_right = __riscv_vsmul(P_right, R8, 1, (vlen));                           \
+    P_left = __riscv_vadd(P_left, P_right, (vlen));                            \
+    P_left = __riscv_vsmul(P_left, Rsq, 1, (vlen));                            \
+    P_left = __riscv_vsmul(P_left, R, 1, (vlen));                              \
+    (P_SIN) = __riscv_vadd(R, __riscv_vsll(P_left, 1, (vlen)), (vlen));        \
+    (SIN_scale) = __riscv_vadd(m, 61, (vlen));                                 \
+    VBOOL pole = __riscv_vmseq(R, 0, (vlen));                                  \
+    if (__riscv_vcpop(pole, (vlen)) > 0) {                                     \
+      VFLOAT pm_inf = __riscv_vfrec7(pole, I_AS_F(R), (vlen));                 \
+      pm_inf = __riscv_vfsgnjn(pm_inf, I_AS_F(__riscv_vsll((n), 63, (vlen))),  \
+                               (vlen));                                        \
+      (vy_special) = __riscv_vmerge((vy_special), pm_inf, pole, (vlen));       \
+      (special_args) = __riscv_vmor((special_args), pole, (vlen));             \
+      (P_SIN) = __riscv_vmerge((P_SIN), 0x8000, pole, (vlen));                 \
+    }                                                                          \
+  } while (0)
+
+void F_VER1(API) {
+  size_t vlen;
+  VFLOAT vx, vx_orig, vy, vy_special;
+  VBOOL special_args;
+
+  SET_ROUNDTONEAREST;
+  // stripmining over input arguments
+  for (; _inarg_n > 0; _inarg_n -= vlen) {
+    vlen = VSET(_inarg_n);
+    vx = VFLOAD_INARG1(vlen);
+
+    // Handle Inf and NaN and |vx| < 2^(-60)
+    EXCEPTION_HANDLING_TGAMMA(vx, special_args, vy_special, vlen);
+    vx_orig = vx;
+
+    vx = __riscv_vfabs(vx, vlen);
+    vx = __riscv_vfmin(vx, 0x1.67p+7, vlen);
+    vx_orig = __riscv_vfsgnj(vx, vx_orig, vlen);
+
+    VFLOAT vx_hi = vx;
+    VFLOAT vx_lo = VFMV_VF(fp_posZero, vlen);
+    VBOOL x_lt_0 = __riscv_vmflt(vx_orig, fp_posZero, vlen);
+
+    // VINT P_SIN, SIN_scale, lsb;
+    // TGAMMA_SIN(vx_orig, P_SIN, SIN_scale, lsb, vy_special, special_args,
+    // vlen);
+
+    if (__riscv_vcpop(x_lt_0, vlen) > 0) {
+      // add 1 to argument
+      VFLOAT a_tmp_hi = __riscv_vfadd(vx_hi, fp_posOne, vlen);
+      VFLOAT a_tmp_lo = __riscv_vfrsub(a_tmp_hi, fp_posOne, vlen);
+      a_tmp_lo = __riscv_vfadd(a_tmp_lo, vx_hi, vlen);
+      vx_hi = __riscv_vmerge(vx_hi, a_tmp_hi, x_lt_0, vlen);
+      vx_lo = __riscv_vmerge(vx_lo, a_tmp_lo, x_lt_0, vlen);
+    }
+
+    VFLOAT y_hi, y_lo;
+    VINT n, EXP;
+
+    VINT XF = __riscv_vsll(VMVI_VX(1, vlen), 61, vlen);
+    VINT XF_scale = VMVI_VX(61, vlen);
+
+    VBOOL x_lt_2 = __riscv_vmflt(vx_hi, 0x1.0p1, vlen);
+
+    if (__riscv_vcpop(x_lt_2, vlen) > 0) {
+      // So x = 2^(-m) val, val is in [1, 2)
+      // Create fixed point X in scale Q(61+m)
+      VINT m = __riscv_vsra(F_AS_I(vx_hi), MAN_LEN, vlen);
+      m = __riscv_vrsub(__riscv_vand(m, 0x7FF, vlen), EXP_BIAS, vlen);
+      // at this point, m >= 0, x = 2^(-m) val, val in [1, 2)
+      VINT XF_scale_1 = __riscv_vadd(m, 61, vlen);
+      VINT scale_m =
+          __riscv_vsll(__riscv_vadd(XF_scale_1, EXP_BIAS, vlen), MAN_LEN, vlen);
+      VFLOAT x_tmp = __riscv_vfmul(vx_hi, I_AS_F(scale_m), vlen);
+      VINT X = __riscv_vfcvt_x(x_tmp, vlen);
+      // X is vx_hi in fixed-point, Q(61+m)
+      x_tmp = __riscv_vfmul(vx_lo, I_AS_F(scale_m), vlen);
+      X = __riscv_vadd(X, __riscv_vfcvt_x(x_tmp, vlen), vlen);
+      // X is (vx_hi + vx_lo) in fixed-point, Q(61+m)
+      VINT One_plus_X =
+          __riscv_vadd(XF, __riscv_vsra(X, I_AS_U(m), vlen), vlen);
+      // One_plus_X is 1+x in Q61
+      VFLOAT b = VFMV_VF(fp_posZero, vlen);
+
+      // if 1 <= x < 2, gamma(x) = (1/x) gamma(x+1)
+      // if 0 < x < 1,  gamma(x) = (1/(x(x+1))) gamma(x+2)
+      VBOOL x_ge_1 = __riscv_vmfge(vx_hi, fp_posOne, vlen);
+      VBOOL cond = __riscv_vmand(x_lt_2, x_ge_1, vlen);
+      // cond is 1 <= x < 2
+      XF_scale = __riscv_vmerge(XF_scale, XF_scale_1, cond, vlen);
+      XF = __riscv_vmerge(XF, X, cond, vlen);
+      b = __riscv_vfmerge(b, fp_posOne, cond, vlen);
+      // at this point, if input x is between [1, 2), XF is x in scale 61+m
+      // which is 61 (as m is 0).
+
+      cond = __riscv_vmandn(x_lt_2, x_ge_1, vlen);
+      // cond is 0 < x < 1
+      X = __riscv_vsmul(X, One_plus_X, 1, vlen);
+      XF_scale_1 = __riscv_vadd(m, 59, vlen);
+      XF_scale = __riscv_vmerge(XF_scale, XF_scale_1, cond, vlen);
+      XF = __riscv_vmerge(XF, X, cond, vlen);
+      b = __riscv_vfmerge(b, 0x1.0p1, cond, vlen);
+      // at this point, XF is either 1, x, or x(x+1) in fixed point
+      // scale given in XF_scale which is either 62, 61+m, or 59+m
+
+      // now set (vx_hi, vx_lo) to x + b, b = 0, 1, or 2
+      x_tmp = __riscv_vfadd(b, vx_hi, vlen);
+      VFLOAT x_tmp2 = __riscv_vfsub(b, x_tmp, vlen);
+      x_tmp2 = __riscv_vfadd(x_tmp2, vx_hi, vlen);
+      vx_hi = x_tmp;
+      vx_lo = __riscv_vfadd(vx_lo, x_tmp2, vlen);
+    }
+
+    STIRLING_POWER(vx_hi, vx_lo, n, EXP, vlen);
+    /* Stirling factor is 2^n * e, EXP is e in Q62 */
+
+    VINT P_SC, Q_SC, Numer_tail, Denom_tail;
+    STIRLING_CORRECTION(vx_hi, vx_lo, P_SC, Q_SC, vlen);
+    /* correction term is 2 * P_SC / Q_SC, P_SC is Q78, Q_SC is Q79 */
+
+    /* 2^(n-61) * EXP * P_SC / Q_SC is gamma(x) for x >= 2 */
+    VINT P = __riscv_vsmul(EXP, P_SC, 1, vlen);
+    /* P is Q77 */
+
+    /* now incoporate XF into Q_SC */
+    VINT Q = __riscv_vsmul(XF, Q_SC, 1, vlen);
+    /* scale of Q is 79 - 63 + XF_scale  = 16 + XF_scale */
+
+    /* difference is 16 + XF_scale - 77, which is XF_scale - 61 */
+    XF_scale = __riscv_vsub(XF_scale, 61, vlen);
+    n = __riscv_vadd(n, XF_scale, vlen);
+    /* 2^n P / Q is the answer if input is positive */
+    /* For negative input, the answer is the reciprocal times pi/sin(pi x) */
+
+    VINT Numer = P;
+    VINT Denom = Q;
+    VINT vy_sign;
+    vy_sign = __riscv_vxor(vy_sign, vy_sign, vlen);
+
+    if (__riscv_vcpop(x_lt_0, vlen) > 0) {
+      /* we first recipricate and change n to negative n */
+      Numer = __riscv_vmerge(Numer, Q, x_lt_0, vlen);
+      Denom = __riscv_vmerge(Denom, P, x_lt_0, vlen);
+
+      VINT P_SIN, SIN_scale, lsb;
+      TGAMMA_SIN(vx_orig, P_SIN, SIN_scale, lsb, vy_special, special_args,
+                 vlen);
+
+      vy_sign = __riscv_vmerge(vy_sign, lsb, x_lt_0, vlen);
+
+      P_SIN = __riscv_vsmul(P_SIN, Denom, 1, vlen);
+      Denom = __riscv_vmerge(Denom, P_SIN, x_lt_0, vlen);
+
+      SIN_scale = __riscv_vsub(SIN_scale, 63, vlen);
+      VINT n_prime = __riscv_vrsub(n, 0, vlen);
+      n_prime = __riscv_vadd(n_prime, SIN_scale, vlen);
+      n = __riscv_vmerge(n, n_prime, x_lt_0, vlen);
+    }
+
+    VFLOAT numer_hi, numer_lo, denom_hi, denom_lo;
+    numer_hi = __riscv_vfcvt_f(Numer, vlen);
+    Numer_tail = __riscv_vsub(Numer, __riscv_vfcvt_x(numer_hi, vlen), vlen);
+    numer_lo = __riscv_vfcvt_f(Numer_tail, vlen);
+
+    denom_hi = __riscv_vfcvt_f(Denom, vlen);
+    Denom_tail = __riscv_vsub(Denom, __riscv_vfcvt_x(denom_hi, vlen), vlen);
+    denom_lo = __riscv_vfcvt_f(Denom_tail, vlen);
+
+    DIV_N2D2(numer_hi, numer_lo, denom_hi, denom_lo, vy, vlen);
+    FAST_LDEXP(vy, n, vlen);
+
+    vy_sign = __riscv_vsll(vy_sign, 63, vlen);
+    vy = __riscv_vfsgnjx(vy, I_AS_F(vy_sign), vlen);
+
+    vy = __riscv_vmerge(vy, vy_special, special_args, vlen);
+
+    // copy vy into y and increment addr pointers
+    VFSTORE_OUTARG1(vy, vlen);
+
+    INCREMENT_INARG1(vlen);
+    INCREMENT_OUTARG1(vlen);
+  }
+  RESTORE_FRM;
+}
diff --git a/src/rvvlm_tgammaD.c b/src/rvvlm_tgammaD.c
new file mode 100644
index 0000000..48038de
--- /dev/null
+++ b/src/rvvlm_tgammaD.c
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <riscv_vector.h>
+#include <stdio.h>
+
+#include "rvvlm.h"
+#define API_SIGNATURE API_SIGNATURE_11
+#define STRIDE UNIT_STRIDE
+
+#include RVVLM_TGAMMAD_VSET_CONFIG
+
+#include "rvvlm_gammafuncsD.h"
+
+#include "rvvlm_tgammaD.inc.h"
diff --git a/src/rvvlm_tgammaDI.c b/src/rvvlm_tgammaDI.c
new file mode 100644
index 0000000..98655ab
--- /dev/null
+++ b/src/rvvlm_tgammaDI.c
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <riscv_vector.h>
+#include <stdio.h>
+
+#include "rvvlm.h"
+#define API_SIGNATURE API_SIGNATURE_11
+#define STRIDE GENERAL_STRIDE
+
+#include RVVLM_TGAMMADI_VSET_CONFIG
+
+#include "rvvlm_gammafuncsD.h"
+
+#include "rvvlm_tgammaD.inc.h"
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f6bed89..e47ab42 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -79,6 +79,8 @@ set(TEST_SOURCES
     src/test_sinhI.cpp
     src/test_tanh.cpp
     src/test_tanhI.cpp
+    src/test_tgamma.cpp
+    src/test_tgammaI.cpp
 )
 
 add_executable(test_veclibm ${TEST_SOURCES})
diff --git a/test/include/test_infra.h b/test/include/test_infra.h
index eb8609b..2bed42b 100644
--- a/test/include/test_infra.h
+++ b/test/include/test_infra.h
@@ -101,3 +101,8 @@ long double recip_scale(long double);
 long double erfl_prime(long double);
 long double erfcl_prime(long double);
 long double cdfnorml_prime(long double);
+long double log_4_stirling(long double);
+long double stirling_power(long double);
+long double stirling_correction(long double);
+long double tgammal_mod(long double);
+long double sinpix_by_pi(long double);
diff --git a/test/src/test_acos.cpp b/test/src/test_acos.cpp
index f0ee50f..de12eef 100644
--- a/test/src/test_acos.cpp
+++ b/test/src/test_acos.cpp
@@ -19,21 +19,21 @@ TEST(acos, test) {
   x_start = -0x1.0p-40;
   x_end = 0x1.0p-40;
   ;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acos, acosl, x_start, x_end, nb_tests);
 
   x_start = -0.5;
   x_end = 0.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acos, acosl, x_start, x_end, nb_tests);
 
   x_start = 0.5;
   x_end = 1.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acos, acosl, x_start, x_end, nb_tests);
 
   x_start = -1.0;
   x_end = -0.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acos, acosl, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_acospi.cpp b/test/src/test_acospi.cpp
index cf851f9..794e770 100644
--- a/test/src/test_acospi.cpp
+++ b/test/src/test_acospi.cpp
@@ -18,21 +18,21 @@ TEST(acospi, test) {
 
   x_start = -0x1.0p-40;
   x_end = 0x1.0p-40;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acospi, acospil, x_start, x_end, nb_tests);
 
   x_start = -0.5;
   x_end = 0.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acospi, acospil, x_start, x_end, nb_tests);
 
   x_start = 0.5;
   x_end = 1.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acospi, acospil, x_start, x_end, nb_tests);
 
   x_start = -1.0;
   x_end = -0.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acospi, acospil, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_asinh.cpp b/test/src/test_asinh.cpp
index 51f20ff..baf5b00 100644
--- a/test/src/test_asinh.cpp
+++ b/test/src/test_asinh.cpp
@@ -18,41 +18,41 @@ TEST(asinh, test) {
 
   x_start = 0x1.0p-40;
   x_end = 0x1.0p-35;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = -0x1.0p-35;
   x_end = -0x1.0p-40;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-20;
   x_end = 0x1.0p-10;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-6;
   x_end = 0x1.0p0;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p0;
   x_end = 0x1.0p2;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = -0x1.0p0;
   x_end = -0x1.0p2;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p490;
   x_end = 0x1.0p520;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p1020;
   x_end = 0x1.FFFFFFFFFFp1023;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_atan2.cpp b/test/src/test_atan2.cpp
index 6c890f8..972fd43 100644
--- a/test/src/test_atan2.cpp
+++ b/test/src/test_atan2.cpp
@@ -24,7 +24,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p0;
   y_end = 0x1.fffp0;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -34,7 +34,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p1020;
   y_end = 0x1.ffffffffp1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -44,7 +44,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p-1020;
   y_end = 0x1.ffffffffp-1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -54,7 +54,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p0;
   y_end = 0x1.fffp0;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -64,7 +64,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p1020;
   y_end = 0x1.ffffffffp1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -74,7 +74,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p-1020;
   y_end = 0x1.ffffffffp-1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -84,7 +84,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p0;
   y_end = -0x1.fffp0;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -94,7 +94,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p1020;
   y_end = -0x1.ffffffffp1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -104,7 +104,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p-1020;
   y_end = -0x1.ffffffffp-1020;
-  nb_y = 200000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -114,7 +114,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p0;
   y_end = -0x1.fffp0;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -124,7 +124,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p1020;
   y_end = -0x1.ffffffffp1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -134,7 +134,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p-1020;
   y_end = -0x1.ffffffffp-1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 }
diff --git a/test/src/test_atan2pi.cpp b/test/src/test_atan2pi.cpp
index d3bbf65..85bf2f6 100644
--- a/test/src/test_atan2pi.cpp
+++ b/test/src/test_atan2pi.cpp
@@ -20,109 +20,109 @@ TEST(atan2pi, test) {
 
   x_start = 0x1.000000001p0;
   x_end = 0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = 0x1.01p0;
   y_end = 0x1.fffp0;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = 0x1.000000001p0;
   x_end = 0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = 0x1.01p1020;
   y_end = 0x1.ffffffffp1020;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = 0x1.000000001p0;
   x_end = 0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = 0x1.01p-1020;
   y_end = 0x1.ffffffffp-1020;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = -0x1.000000001p0;
   x_end = -0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = 0x1.01p0;
   y_end = 0x1.fffp0;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = -0x1.000000001p0;
   x_end = -0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = 0x1.01p1020;
   y_end = 0x1.ffffffffp1020;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = -0x1.000000001p0;
   x_end = -0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = 0x1.01p-1020;
   y_end = 0x1.ffffffffp-1020;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = 0x1.000000001p0;
   x_end = 0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = -0x1.01p0;
   y_end = -0x1.fffp0;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = 0x1.000000001p0;
   x_end = 0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = -0x1.01p1020;
   y_end = -0x1.ffffffffp1020;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = 0x1.000000001p0;
   x_end = 0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = -0x1.01p-1020;
   y_end = -0x1.ffffffffp-1020;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = -0x1.000000001p0;
   x_end = -0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = -0x1.01p0;
   y_end = -0x1.fffp0;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = -0x1.000000001p0;
   x_end = -0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = -0x1.01p1020;
   y_end = -0x1.ffffffffp1020;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 
   x_start = -0x1.000000001p0;
   x_end = -0x1.ffffffffffp0;
-  nb_x = 8;
+  nb_x = 4;
   y_start = -0x1.01p-1020;
   y_end = -0x1.ffffffffp-1020;
-  nb_y = 50000;
+  nb_y = 5000;
   report_err2_fp64(rvvlm_atan2pi, atan2pil, x_start, x_end, nb_x, y_start,
                    y_end, nb_y, swap_xy);
 }
diff --git a/test/src/test_atanh.cpp b/test/src/test_atanh.cpp
index da48400..9f1f492 100644
--- a/test/src/test_atanh.cpp
+++ b/test/src/test_atanh.cpp
@@ -18,41 +18,41 @@ TEST(atanh, test) {
 
   x_start = 0x1.0p-40;
   x_end = 0x1.0p-35;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_atanh, atanhl, x_start, x_end, nb_tests);
 
   x_start = -0x1.0p-35;
   x_end = -0x1.0p-40;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_atanh, atanhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-20;
   x_end = 0x1.0p-10;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_atanh, atanhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-10;
   x_end = 0x1.ffp-3;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_atanh, atanhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.00p-9;
   x_end = 0x1.0p-2;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_atanh, atanhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-1;
   x_end = 0x1.0p0 - 0x1.0p-51;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_atanh, atanhl, x_start, x_end, nb_tests);
 
   x_start = -0x1.0p-1;
   x_end = -(0x1.0p0 - 0x1.0p-51);
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_atanh, atanhl, x_start, x_end, nb_tests);
 
   x_start = -1.0 + 0x1.0p-53;
   x_end = 1.0 - 0x1.0p-51;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_atanh, atanhl, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_cdfnorm.cpp b/test/src/test_cdfnorm.cpp
index e96c5f3..6197ef8 100644
--- a/test/src/test_cdfnorm.cpp
+++ b/test/src/test_cdfnorm.cpp
@@ -18,36 +18,36 @@ TEST(cdfnorm, test) {
 
   x_start = -0x1.0p0;
   x_end = 0x1.0p0;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-2;
   x_end = 0x1.0p0;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, -x_start, -x_end, nb_tests);
 
   x_start = 0x1.0p0;
   x_end = 0x1.0p3;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, -x_start, -x_end, nb_tests);
 
   x_start = 0x1.0p3;
   x_end = 30.0;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, -x_start, -x_end, nb_tests);
 
   x_start = 0x1.0p-80;
   x_end = 0x1.0p-50;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, -x_start, -x_end, nb_tests);
 
   x_start = 0x1.0p-50;
   x_end = 0x1.0p-10;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_cdfnorm, cdfnorml, -x_start, -x_end, nb_tests);
 }
diff --git a/test/src/test_cdfnorminv.cpp b/test/src/test_cdfnorminv.cpp
index 27b684f..4e90d75 100644
--- a/test/src/test_cdfnorminv.cpp
+++ b/test/src/test_cdfnorminv.cpp
@@ -26,19 +26,19 @@ TEST(cdfnorminv, tiny_args) {
 
   x_start = 0x1.0p-1074;
   x_end = 0x1.0p-1000;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_cdfnorminv, cdfnorml, cdfnorml_prime, x_start,
                         x_end, nb_tests);
 
   x_start = 0x1.0p-1000;
   x_end = 0x1.0p-500;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_cdfnorminv, cdfnorml, cdfnorml_prime, x_start,
                         x_end, nb_tests);
 
   x_start = 0x1.0p-55;
   x_end = 0x1.0p-53;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_cdfnorminv, cdfnorml, cdfnorml_prime, x_start,
                         x_end, nb_tests);
 }
@@ -51,13 +51,13 @@ TEST(cdfnorminv, small_args) {
 
   x_start = 0x1.0p-50;
   x_end = 0x1.0p-20;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_cdfnorminv, cdfnorml, cdfnorml_prime, x_start,
                         x_end, nb_tests);
 
   x_start = 0x1.0p-20;
   x_end = 0x1.0p-4;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_cdfnorminv, cdfnorml, cdfnorml_prime, x_start,
                         x_end, nb_tests);
 }
@@ -70,19 +70,19 @@ TEST(cdfnorminv, medium_args) {
 
   x_start = 0x1.0p-5;
   x_end = 0x1.0p-2;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_cdfnorminv, cdfnorml, cdfnorml_prime, x_start,
                         x_end, nb_tests);
 
   x_start = 0x1.0p-2;
   x_end = 0x1.8p-1;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_cdfnorminv, cdfnorml, cdfnorml_prime, x_start,
                         x_end, nb_tests);
 
   x_start = 1.0 - 0x1.0p-53;
   x_end = 0x1.8p-1;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_cdfnorminv, cdfnorml, cdfnorml_prime, x_start,
                         x_end, nb_tests);
 }
diff --git a/test/src/test_erf.cpp b/test/src/test_erf.cpp
index 6f82462..cd05e2a 100644
--- a/test/src/test_erf.cpp
+++ b/test/src/test_erf.cpp
@@ -18,26 +18,26 @@ TEST(erf, test) {
 
   x_start = 0x1.0p-40;
   x_end = 0x1.0p-20;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_erf, erfl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-20;
   x_end = 0x1.0p1;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_erf, erfl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p1;
   x_end = 0x1.0p2;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_erf, erfl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p2;
   x_end = 7.0;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_erf, erfl, x_start, x_end, nb_tests);
 
   x_start = -6.5;
   x_end = 6.5;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_erf, erfl, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_erfcinv.cpp b/test/src/test_erfcinv.cpp
index 5dfae21..7000476 100644
--- a/test/src/test_erfcinv.cpp
+++ b/test/src/test_erfcinv.cpp
@@ -25,19 +25,19 @@ TEST(erfcinv, tiny_args) {
 
   x_start = 0x1.0p-1074;
   x_end = 0x1.0p-1000;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.0p-1000;
   x_end = 0x1.0p-500;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.0p-55;
   x_end = 0x1.0p-53;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                         nb_tests);
 }
@@ -51,14 +51,14 @@ TEST(erfcinv, small_args) {
   x_start = 0x1.0p-50;
   ;
   x_end = 0x1.0p-20;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.0p-20;
   ;
   x_end = 0x1.0p-4;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                         nb_tests);
 }
@@ -72,20 +72,20 @@ TEST(erfcinv, medium_args) {
   x_start = 0x1.0p-4;
   ;
   x_end = 0x1.0p-1;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.0p-1;
   ;
   x_end = 0x1.8p0;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 2.0 - 0x1.0p-52;
   x_end = 0x1.8p0;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                         nb_tests);
 }
diff --git a/test/src/test_erfinv.cpp b/test/src/test_erfinv.cpp
index 9e7a436..81839b0 100644
--- a/test/src/test_erfinv.cpp
+++ b/test/src/test_erfinv.cpp
@@ -24,23 +24,20 @@ TEST(erfinv, small_args) {
   COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")
 
   x_start = 0x1.0p-40;
-  ;
   x_end = 0x1.0p-30;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.0p-30;
-  ;
   x_end = 0x1.0p-10;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.0p-10;
-  ;
   x_end = 0x1.0p-4;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 }
@@ -52,43 +49,38 @@ TEST(erfinv, middle_args) {
   COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")
 
   x_start = 0x1.0p-4;
-  ;
   x_end = 0x1.0p-2;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.0p-2;
-  ;
   x_end = 0x1.0p-1;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.0p-1;
-  ;
   x_end = 0x1.78p-1;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.6p-1;
-  ;
   x_end = 0x1.7cp-1;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = 0x1.71p-1;
-  ;
   x_end = 0x1.8p-1;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 
   x_start = -0.8;
   x_end = 0.8;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 
@@ -107,7 +99,7 @@ TEST(erfinv, close_to_1) {
 
   x_start = 1.0 - 0x1.0p-53;
   x_end = 0x1.ffp-1;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                         nb_tests);
 }
diff --git a/test/src/test_exp10.cpp b/test/src/test_exp10.cpp
index bf563ee..ffdc649 100644
--- a/test/src/test_exp10.cpp
+++ b/test/src/test_exp10.cpp
@@ -18,12 +18,12 @@ TEST(exp10, small_args) {
 
   x_start = -0.34;
   x_end = 0.34;
-  nb_tests = 300000;
+  nb_tests = 30000;
   report_err_fp64(rvvlm_exp10, exp10l, x_start, x_end, nb_tests);
 
   x_start = -3.0;
   x_end = 3.0;
-  nb_tests = 400000;
+  nb_tests = 40000;
   report_err_fp64(rvvlm_exp10, exp10l, x_start, x_end, nb_tests);
 }
 
@@ -35,12 +35,12 @@ TEST(exp10, medium_args) {
 
   x_start = -15.0;
   x_end = -10.0;
-  nb_tests = 400000;
+  nb_tests = 40000;
   report_err_fp64(rvvlm_exp10, exp10l, x_start, x_end, nb_tests);
 
   x_start = 10.0;
   x_end = 15.0;
-  nb_tests = 400000;
+  nb_tests = 40000;
   report_err_fp64(rvvlm_exp10, exp10l, x_start, x_end, nb_tests);
 }
 
@@ -52,11 +52,11 @@ TEST(exp10, large_args) {
 
   x_start = 295.0;
   x_end = 308.0;
-  nb_tests = 400000;
+  nb_tests = 40000;
   report_err_fp64(rvvlm_exp10, exp10l, x_start, x_end, nb_tests);
 
   x_start = -323.0;
   x_end = -300.0;
-  nb_tests = 400000;
+  nb_tests = 40000;
   report_err_fp64(rvvlm_exp10, exp10l, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_exp2.cpp b/test/src/test_exp2.cpp
index 25358d7..9ff2919 100644
--- a/test/src/test_exp2.cpp
+++ b/test/src/test_exp2.cpp
@@ -18,12 +18,12 @@ TEST(exp2, small_args) {
 
   x_start = -0.34;
   x_end = 0.34;
-  nb_tests = 300000;
+  nb_tests = 80000;
   report_err_fp64(rvvlm_exp2, exp2l, x_start, x_end, nb_tests);
 
   x_start = -3.0;
   x_end = 3.0;
-  nb_tests = 400000;
+  nb_tests = 100000;
   report_err_fp64(rvvlm_exp2, exp2l, x_start, x_end, nb_tests);
 }
 
@@ -35,12 +35,12 @@ TEST(exp2, medium_args) {
 
   x_start = -14.0;
   x_end = -10.0;
-  nb_tests = 400000;
+  nb_tests = 100000;
   report_err_fp64(rvvlm_exp2, exp2l, x_start, x_end, nb_tests);
 
   x_start = 10.0;
   x_end = 15.0;
-  nb_tests = 400000;
+  nb_tests = 100000;
   report_err_fp64(rvvlm_exp2, exp2l, x_start, x_end, nb_tests);
 }
 
diff --git a/test/src/test_expm1.cpp b/test/src/test_expm1.cpp
index 0369d62..841c12d 100644
--- a/test/src/test_expm1.cpp
+++ b/test/src/test_expm1.cpp
@@ -18,12 +18,12 @@ TEST(expm1, small_args) {
 
   x_start = -0.01;
   x_end = 0.01;
-  nb_tests = 300000;
+  nb_tests = 30000;
   report_err_fp64(rvvlm_expm1, expm1l, x_start, x_end, nb_tests);
 
   x_start = -.3;
   x_end = 0.3;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_expm1, expm1l, x_start, x_end, nb_tests);
 }
 
@@ -35,12 +35,12 @@ TEST(expm1, medium_args) {
 
   x_start = 1.0;
   x_end = 10.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_expm1, expm1l, x_start, x_end, nb_tests);
 
   x_start = -10.0;
   x_end = -1.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_expm1, expm1l, x_start, x_end, nb_tests);
 
   x_start = -40.0;
@@ -50,7 +50,7 @@ TEST(expm1, medium_args) {
 
   x_start = 36.0;
   x_end = 40.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_expm1, expm1l, x_start, x_end, nb_tests);
 }
 
@@ -62,11 +62,11 @@ TEST(expm1, large_args) {
 
   x_start = 700.0;
   x_end = 709.0;
-  nb_tests = 400000;
+  nb_tests = 40000;
   report_err_fp64(rvvlm_expm1, expm1l, x_start, x_end, nb_tests);
 
   x_start = -50.0;
   x_end = -40.0;
-  nb_tests = 400000;
+  nb_tests = 40000;
   report_err_fp64(rvvlm_expm1, expm1l, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_infra.cpp b/test/src/test_infra.cpp
index ec2acd4..bb0eda0 100644
--- a/test/src/test_infra.cpp
+++ b/test/src/test_infra.cpp
@@ -1313,3 +1313,40 @@ long double cdfnorml_prime(long double x) {
   y = one_ov_rt_2pi * expl(-x * x * half);
   return y;
 }
+
+long double log_4_stirling(long double x) {
+  long double e = 2.7182818284590452353602874713526615L;
+  long double y = x / e;
+  y = logl(y);
+  return y;
+}
+
+long double stirling_power(long double x) {
+  long double e = 2.7182818284590452353602874713526615L;
+  long double half = 0x1.0p-1L;
+  long double x_minus_half = x - half;
+  long double y = x / e;
+  y = powl(y, x_minus_half);
+  return y;
+}
+
+long double stirling_correction(long double x) {
+  long double stirling = stirling_power(x);
+  long double y = tgammal(x) / stirling;
+  return y;
+}
+
+long double tgammal_mod(long double x) {
+  // tgammal(x) for x >= 0; else tgamma(1-x)
+  long double one_minus_x = 1.0L - x;
+  long double y;
+  y = (x >= 0.0) ? tgammal(x) : tgammal(one_minus_x);
+  return y;
+}
+
+long double sinpix_by_pi(long double x) {
+  long double pi = 0x1.921f'b5444'2d18'4698'98cc'5170'1b83'9p1L;
+  long double y = pi * x;
+  y = -sinl(y) / pi;
+  return y;
+}
diff --git a/test/src/test_log.cpp b/test/src/test_log.cpp
index 965b37e..9ec687b 100644
--- a/test/src/test_log.cpp
+++ b/test/src/test_log.cpp
@@ -25,17 +25,17 @@ TEST(log, around_1) {
 
   x_start = 0.7;
   x_end = 1.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_log, logl, x_start, x_end, nb_tests);
 
   x_start = 1.0;
   x_end = 4.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_log, logl, x_start, x_end, nb_tests);
 
   x_start = 0.25;
   x_end = 1.0;
-  nb_tests = 1000000;
+  nb_tests = 100000;
   report_err_fp64(rvvlm_log, logl, x_start, x_end, nb_tests);
 }
 
@@ -47,11 +47,11 @@ TEST(log, extreme_args) {
 
   x_start = 0x1.0p1023;
   x_end = 0x1.ffffffp1023;
-  nb_tests = 400000;
+  nb_tests = 100000;
   report_err_fp64(rvvlm_log, logl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-1074;
   x_end = 0x1.0p-1022;
-  nb_tests = 400000;
+  nb_tests = 100000;
   report_err_fp64(rvvlm_log, logl, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_log2.cpp b/test/src/test_log2.cpp
index 3508c70..7c39aa7 100644
--- a/test/src/test_log2.cpp
+++ b/test/src/test_log2.cpp
@@ -25,17 +25,17 @@ TEST(log2, around_1) {
 
   x_start = 0.7;
   x_end = 1.5;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_log2, log2l, x_start, x_end, nb_tests);
 
   x_start = 1.0;
   x_end = 4.0;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_log2, log2l, x_start, x_end, nb_tests);
 
   x_start = 0.25;
   x_end = 1.0;
-  nb_tests = 40000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_log2, log2l, x_start, x_end, nb_tests);
 }
 
@@ -47,11 +47,11 @@ TEST(log2, extreme_args) {
 
   x_start = 0x1.0p1023;
   x_end = 0x1.ffffffp1023;
-  nb_tests = 400000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_log2, log2l, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-1074;
   x_end = 0x1.0p-1022;
-  nb_tests = 400000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_log2, log2l, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_pow.cpp b/test/src/test_pow.cpp
index 9411f0f..f8c4b2f 100644
--- a/test/src/test_pow.cpp
+++ b/test/src/test_pow.cpp
@@ -33,15 +33,15 @@ TEST(pow, medium_args) {
 
   x_start = 0x1.0p-3;
   x_end = 0x1.0p4;
-  nb_pts_x = 400;
+  nb_pts_x = 200;
   y_start = 0x1.0p-3;
   y_end = 0x1.0p4;
-  nb_pts_y = 400;
+  nb_pts_y = 200;
   report_err2_fp64(rvvlm_pow, powl, x_start, x_end, nb_pts_x, y_start, y_end,
                    nb_pts_y, swap_xy);
 
-  nb_targets = 10;
-  nb_tests = 10000;
+  nb_targets = 5;
+  nb_tests = 1000;
 
   target_start = -32.0 * log(2.0);
   target_end = 32.0 * log(2.0);
@@ -69,8 +69,8 @@ TEST(pow, large_args) {
 
   COMMENT("pow: current chosen algorithm; reduced argument in FP64 only")
 
-  nb_targets = 10;
-  nb_tests = 100000;
+  nb_targets = 5;
+  nb_tests = 1000;
 
   x_start = 0x1.0p-10;
   x_end = 0x1.0p10;
diff --git a/test/src/test_tan.cpp b/test/src/test_tan.cpp
index 317d3b6..e029114 100644
--- a/test/src/test_tan.cpp
+++ b/test/src/test_tan.cpp
@@ -25,12 +25,12 @@ TEST(tan, small_args) {
 
   x_start = -0.7;
   x_end = 0.7;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_tan, tanl, x_start, x_end, nb_tests);
 
   x_start = -6.0;
   x_end = 6.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_tan, tanl, x_start, x_end, nb_tests);
 }
 
@@ -42,16 +42,16 @@ TEST(tan, large_args) {
 
   x_start = 1.0;
   x_end = 0x1.0p23;
-  nb_tests = 100000;
+  nb_tests = 30000;
   report_err_fp64(rvvlm_tan, tanl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p25;
   x_end = 0x1.0p100;
-  nb_tests = 100000;
+  nb_tests = 30000;
   report_err_fp64(rvvlm_tan, tanl, x_start, x_end, nb_tests);
 
   x_start = 0x1.ffffp1023;
   x_end = 0x1.0p1000;
-  nb_tests = 100000;
+  nb_tests = 30000;
   report_err_fp64(rvvlm_tan, tanl, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_tgamma.cpp b/test/src/test_tgamma.cpp
new file mode 100644
index 0000000..2171cfa
--- /dev/null
+++ b/test/src/test_tgamma.cpp
@@ -0,0 +1,83 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+#include <math.h>
+
+#include "rvvlm.h"
+#include "test_infra.h"
+
+TEST(tgamma, special) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  COMMENT("tgamma: current chosen algorithm; reduced argument in FP64 only")
+
+  show_special_fp64(rvvlm_tgamma, "Special Value handling of this function");
+}
+
+TEST(tgamma, gamma_tiny) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  x_start = 0x1.0p-120;
+  x_end = 0x1.0p-118;
+  nb_tests = 1000;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+
+  x_start = 0x1.0p-50;
+  x_end = 0x1.0p-40;
+  nb_tests = 1000;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+
+  x_start = -0x1.0p-118;
+  x_end = -0x1.0p-120;
+  nb_tests = 1000;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+
+  x_start = -0x1.0p-40;
+  x_end = -0x1.0p-50;
+  nb_tests = 1000;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+}
+
+TEST(tgamma, gamma_moderate) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  x_start = 0.5;
+  x_end = 4.0;
+  nb_tests = 1000;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+
+  x_start = 4.0;
+  x_end = 10.0;
+  nb_tests = 1000;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+
+  x_start = -0.9;
+  x_end = -0.2;
+  nb_tests = 1000;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+
+  x_start = -10.1;
+  x_end = -0.9;
+  nb_tests = 1231;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+}
+
+TEST(tgamma, gamma_large) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  x_start = 100.0;
+  x_end = 171.0;
+  nb_tests = 1000;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+
+  x_start = -171.1;
+  x_end = -100.3;
+  nb_tests = 1317;
+  report_err_fp64(rvvlm_tgamma, tgammal, x_start, x_end, nb_tests);
+}
diff --git a/test/src/test_tgammaI.cpp b/test/src/test_tgammaI.cpp
new file mode 100644
index 0000000..ec67b20
--- /dev/null
+++ b/test/src/test_tgammaI.cpp
@@ -0,0 +1,24 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+#include <math.h>
+
+#include "rvvlm.h"
+#include "test_infra.h"
+
+TEST(tgammaI, test) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  COMMENT("tgammaI: current chosen algorithm; reduced argument in FP64 only")
+
+  x_start = 0.5;
+  x_end = 1.75;
+  nb_tests = 30;
+  int stride_x = 21;
+  int stride_y = 30;
+  report_err_fp64(rvvlm_tgammaI, tgammal, x_start, x_end, nb_tests, stride_x,
+                  stride_y);
+}