Merge pull request #29 from rivosinc/dev/PingTakPeterTang/gamma

add the FP64 true gamma function tgamma
rivosinc · Mar 29, 2024 · bddd67f · bddd67f
2 parents 194eb48 + ca948ac
commit bddd67f
Show file tree

Hide file tree

Showing 29 changed files with 816 additions and 143 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -98,6 +98,8 @@ set(PROJECT_SOURCES
     src/rvvlm_sinhDI.c
     src/rvvlm_tanhD.c
     src/rvvlm_tanhDI.c
+    src/rvvlm_tgammaD.c
+    src/rvvlm_tgammaDI.c
 )
 
 add_library(vecm

diff --git a/include/rvvlm.h b/include/rvvlm.h
@@ -172,6 +172,17 @@ union sui64_fp64 {
     (delta_Q) = __riscv_vfmul(_q, __riscv_vfrec7((denom), (vlen)), (vlen));    \
   } while (0)
 
+#define ACC_DIV2_N2D2(numer, delta_n, denom, delta_d, Q, delta_Q, vlen)        \
+  do { /* Q + delta_Q ~= (numer + delta_n) / (denom + delta_d) */              \
+    VFLOAT _recip, _q;                                                         \
+    _recip = __riscv_vfrdiv((denom), 0x1.0p0, (vlen));                         \
+    (Q) = __riscv_vfmul((numer), _recip, (vlen));                              \
+    _q = __riscv_vfnmsub((Q), (denom), (numer), (vlen));                       \
+    _q = __riscv_vfnmsac(_q, (Q), (delta_d), (vlen));                          \
+    _q = __riscv_vfadd(_q, (delta_n), (vlen));                                 \
+    (delta_Q) = __riscv_vfmul(_q, _recip, (vlen));                             \
+  } while (0)
+
 #define SQRT2_X2(x, delta_x, r, delta_r, vlen)                                 \
   do {                                                                         \
     VFLOAT xx = __riscv_vfadd((x), (delta_x), (vlen));                         \
@@ -469,6 +480,13 @@ union sui64_fp64 {
 #define RVVLM_TANPIDI_VSET_CONFIG "rvvlm_fp64m2.h"
 #define RVVLM_TANPIDI_MERGED rvvlm_tanpiI
 
+// FP64 tgamma function configuration
+#define RVVLM_TGAMMAD_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_TGAMMAD_STD rvvlm_tgamma
+
+#define RVVLM_TGAMMADI_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_TGAMMADI_STD rvvlm_tgammaI
+
 // FP64 cosh function configuration
 #define RVVLM_COSHD_VSET_CONFIG "rvvlm_fp64m2.h"
 #define RVVLM_COSHD_STD rvvlm_coshD_std
@@ -499,6 +517,7 @@ extern int64_t expD_tbl64_fixedpt[64];
 extern int64_t logD_tbl128_fixedpt[128];
 extern double logtbl_4_powD_128_hi_lo[256];
 extern double dbl_2ovpi_tbl[28];
+extern int64_t factorial_fixedpt[180];
 
 // Define the functions in the vector math library
 void RVVLM_ACOSD_FIXEDPT(size_t x_len, const double *x, double *y);
@@ -703,6 +722,10 @@ void RVVLM_TANHD_STD(size_t x_len, const double *x, double *y);
 void RVVLM_TANHDI_STD(size_t x_len, const double *x, size_t stride_x, double *y,
                       size_t stride_y);
 
+void RVVLM_TGAMMAD_STD(size_t x_len, const double *x, double *y);
+void RVVLM_TGAMMADI_STD(size_t x_len, const double *x, size_t stride_x,
+                        double *y, size_t stride_y);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/rvvlm_gammafuncsD.h b/include/rvvlm_gammafuncsD.h
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Special cases: gamma(+inf) = +inf; gamma(-inf) and gamma(sNaN) give qNaN
+// with invalid; gamma(qNaN) = qNaN; gamma(+-0) = +-inf with divide-by-zero.
+// Tiny x (biased exponent < EXP_BIAS - 60) yields 1/x. On exit special_args
+// flags those lanes, vy_special holds their results, vx there is fp_posOne.
+#define EXCEPTION_HANDLING_TGAMMA(vx, special_args, vy_special, vlen)          \
+  do {                                                                         \
+    VUINT expo_x = __riscv_vand(__riscv_vsrl(F_AS_U((vx)), MAN_LEN, (vlen)),   \
+                                0x7FF, (vlen));                                \
+    VBOOL x_small = __riscv_vmsltu(expo_x, EXP_BIAS - 60, (vlen));             \
+    VBOOL x_InfNaN = __riscv_vmseq(expo_x, 0x7FF, (vlen));                     \
+    (special_args) = __riscv_vmor(x_small, x_InfNaN, (vlen));                  \
+    if (__riscv_vcpop((special_args), (vlen)) > 0) {                           \
+      VUINT vclass = __riscv_vfclass((vx), (vlen));                            \
+      VBOOL x_negInf;                                                          \
+      IDENTIFY(vclass, class_negInf, x_negInf, (vlen));                        \
+      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_negInf, (vlen));                 \
+      VFLOAT y_tmp = __riscv_vfadd(x_InfNaN, (vx), (vx), (vlen));              \
+      (vy_special) = __riscv_vmerge((vy_special), y_tmp, x_InfNaN, (vlen));    \
+      y_tmp = __riscv_vfrdiv(x_small, (vx), fp_posOne, (vlen));                \
+      (vy_special) = __riscv_vmerge((vy_special), y_tmp, x_small, (vlen));     \
+      (vx) = __riscv_vfmerge((vx), fp_posOne, (special_args), (vlen));         \
+    }                                                                          \
+  } while (0)
diff --git a/include/rvvlm_tgammaD.inc.h b/include/rvvlm_tgammaD.inc.h
diff --git a/src/rvvlm_tgammaD.c b/src/rvvlm_tgammaD.c
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <riscv_vector.h>
+#include <stdio.h>
+
+#include "rvvlm.h"
+#define API_SIGNATURE API_SIGNATURE_11
+#define STRIDE UNIT_STRIDE
+
+#include RVVLM_TGAMMAD_VSET_CONFIG
+
+#include "rvvlm_gammafuncsD.h"
+
+#include "rvvlm_tgammaD.inc.h"
diff --git a/src/rvvlm_tgammaDI.c b/src/rvvlm_tgammaDI.c
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <riscv_vector.h>
+#include <stdio.h>
+
+#include "rvvlm.h"
+#define API_SIGNATURE API_SIGNATURE_11
+#define STRIDE GENERAL_STRIDE
+
+#include RVVLM_TGAMMADI_VSET_CONFIG
+
+#include "rvvlm_gammafuncsD.h"
+
+#include "rvvlm_tgammaD.inc.h"
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -79,6 +79,8 @@ set(TEST_SOURCES
     src/test_sinhI.cpp
     src/test_tanh.cpp
     src/test_tanhI.cpp
+    src/test_tgamma.cpp
+    src/test_tgammaI.cpp
 )
 
 add_executable(test_veclibm ${TEST_SOURCES})

diff --git a/test/include/test_infra.h b/test/include/test_infra.h
@@ -101,3 +101,8 @@ long double recip_scale(long double);
 long double erfl_prime(long double);
 long double erfcl_prime(long double);
 long double cdfnorml_prime(long double);
+long double log_4_stirling(long double);
+long double stirling_power(long double);
+long double stirling_correction(long double);
+long double tgammal_mod(long double);
+long double sinpix_by_pi(long double);
diff --git a/test/src/test_acos.cpp b/test/src/test_acos.cpp
@@ -19,21 +19,21 @@ TEST(acos, test) {
   x_start = -0x1.0p-40;
   x_end = 0x1.0p-40;
   ;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acos, acosl, x_start, x_end, nb_tests);
 
   x_start = -0.5;
   x_end = 0.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acos, acosl, x_start, x_end, nb_tests);
 
   x_start = 0.5;
   x_end = 1.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acos, acosl, x_start, x_end, nb_tests);
 
   x_start = -1.0;
   x_end = -0.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acos, acosl, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_acospi.cpp b/test/src/test_acospi.cpp
@@ -18,21 +18,21 @@ TEST(acospi, test) {
 
   x_start = -0x1.0p-40;
   x_end = 0x1.0p-40;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acospi, acospil, x_start, x_end, nb_tests);
 
   x_start = -0.5;
   x_end = 0.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acospi, acospil, x_start, x_end, nb_tests);
 
   x_start = 0.5;
   x_end = 1.0;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acospi, acospil, x_start, x_end, nb_tests);
 
   x_start = -1.0;
   x_end = -0.5;
-  nb_tests = 100000;
+  nb_tests = 10000;
   report_err_fp64(rvvlm_acospi, acospil, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_asinh.cpp b/test/src/test_asinh.cpp
@@ -18,41 +18,41 @@ TEST(asinh, test) {
 
   x_start = 0x1.0p-40;
   x_end = 0x1.0p-35;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = -0x1.0p-35;
   x_end = -0x1.0p-40;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-20;
   x_end = 0x1.0p-10;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p-6;
   x_end = 0x1.0p0;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p0;
   x_end = 0x1.0p2;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = -0x1.0p0;
   x_end = -0x1.0p2;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p490;
   x_end = 0x1.0p520;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 
   x_start = 0x1.0p1020;
   x_end = 0x1.FFFFFFFFFFp1023;
-  nb_tests = 40000;
+  nb_tests = 20000;
   report_err_fp64(rvvlm_asinh, asinhl, x_start, x_end, nb_tests);
 }
diff --git a/test/src/test_atan2.cpp b/test/src/test_atan2.cpp
@@ -24,7 +24,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p0;
   y_end = 0x1.fffp0;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -34,7 +34,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p1020;
   y_end = 0x1.ffffffffp1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -44,7 +44,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p-1020;
   y_end = 0x1.ffffffffp-1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -54,7 +54,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p0;
   y_end = 0x1.fffp0;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -64,7 +64,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p1020;
   y_end = 0x1.ffffffffp1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -74,7 +74,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = 0x1.01p-1020;
   y_end = 0x1.ffffffffp-1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -84,7 +84,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p0;
   y_end = -0x1.fffp0;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -94,7 +94,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p1020;
   y_end = -0x1.ffffffffp1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -104,7 +104,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p-1020;
   y_end = -0x1.ffffffffp-1020;
-  nb_y = 200000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -114,7 +114,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p0;
   y_end = -0x1.fffp0;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -124,7 +124,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p1020;
   y_end = -0x1.ffffffffp1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 
@@ -134,7 +134,7 @@ TEST(atan2, test) {
   nb_x = 8;
   y_start = -0x1.01p-1020;
   y_end = -0x1.ffffffffp-1020;
-  nb_y = 20000;
+  nb_y = 2000;
   report_err2_fp64(rvvlm_atan2, atan2l, x_start, x_end, nb_x, y_start, y_end,
                    nb_y, swap_xy);
 }