Merge pull request #30 from rivosinc/dev/PingTakPeterTang/loggamma

add FP64 loggamma function
rivosinc · Apr 10, 2024 · 1f8d20f · 1f8d20f
2 parents bddd67f + e71cbc7
commit 1f8d20f
Show file tree

Hide file tree

Showing 10 changed files with 906 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -98,6 +98,8 @@ set(PROJECT_SOURCES
     src/rvvlm_sinhDI.c
     src/rvvlm_tanhD.c
     src/rvvlm_tanhDI.c
+    src/rvvlm_lgammaD.c
+    src/rvvlm_lgammaDI.c
     src/rvvlm_tgammaD.c
     src/rvvlm_tgammaDI.c
 )

diff --git a/include/rvvlm.h b/include/rvvlm.h
@@ -143,6 +143,21 @@ union sui64_fp64 {
     (prod_lo) = __riscv_vfmsub((x), (y), (prod_hi), (vlen));                   \
   } while (0)
 
+#define PROD_X1Y2(x, y_hi, y_lo, prod_hi, prod_lo, vlen)                       \
+  do {                                                                         \
+    (prod_hi) = __riscv_vfmul((x), (y_hi), (vlen));                            \
+    (prod_lo) = __riscv_vfmsub((x), (y_hi), (prod_hi), (vlen));                \
+    (prod_lo) = __riscv_vfmacc((prod_lo), (x), (y_lo), (vlen));                \
+  } while (0)
+
+#define PROD_X2Y2(x_hi, x_lo, y_hi, y_lo, prod_hi, prod_lo, vlen)              \
+  do {                                                                         \
+    (prod_hi) = __riscv_vfmul((x_hi), (y_hi), (vlen));                         \
+    (prod_lo) = __riscv_vfmsub((x_hi), (y_hi), (prod_hi), (vlen));             \
+    (prod_lo) = __riscv_vfmacc((prod_lo), (x_hi), (y_lo), (vlen));             \
+    (prod_lo) = __riscv_vfmacc((prod_lo), (x_lo), (y_hi), (vlen));             \
+  } while (0)
+
 #define DIV_N1D2(numer, denom, delta_d, Q, q, vlen)                            \
   do {                                                                         \
     Q = __riscv_vfdiv((numer), (denom), (vlen));                               \
@@ -172,6 +187,16 @@ union sui64_fp64 {
     (delta_Q) = __riscv_vfmul(_q, __riscv_vfrec7((denom), (vlen)), (vlen));    \
   } while (0)
 
+#define ACC_DIV2_N1D2(numer, denom, delta_d, Q, delta_Q, vlen)                 \
+  do {                                                                         \
+    VFLOAT _recip, _q;                                                         \
+    _recip = __riscv_vfrdiv((denom), 0x1.0p0, (vlen));                         \
+    (Q) = __riscv_vfmul((numer), _recip, (vlen));                              \
+    _q = __riscv_vfnmsub((Q), (denom), (numer), (vlen));                       \
+    _q = __riscv_vfnmsac(_q, (Q), (delta_d), (vlen));                          \
+    (delta_Q) = __riscv_vfmul(_q, _recip, (vlen));                             \
+  } while (0)
+
 #define ACC_DIV2_N2D2(numer, delta_n, denom, delta_d, Q, delta_Q, vlen)        \
   do {                                                                         \
     VFLOAT _recip, _q;                                                         \
@@ -480,6 +505,13 @@ union sui64_fp64 {
 #define RVVLM_TANPIDI_VSET_CONFIG "rvvlm_fp64m2.h"
 #define RVVLM_TANPIDI_MERGED rvvlm_tanpiI
 
+// FP64 lgamma function configuration
+#define RVVLM_LGAMMAD_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_LGAMMAD_STD rvvlm_lgamma
+
+#define RVVLM_LGAMMADI_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_LGAMMADI_STD rvvlm_lgammaI
+
 // FP64 tgamma function configuration
 #define RVVLM_TGAMMAD_VSET_CONFIG "rvvlm_fp64m1.h"
 #define RVVLM_TGAMMAD_STD rvvlm_tgamma
@@ -722,6 +754,10 @@ void RVVLM_TANHD_STD(size_t x_len, const double *x, double *y);
 void RVVLM_TANHDI_STD(size_t x_len, const double *x, size_t stride_x, double *y,
                       size_t stride_y);
 
+void RVVLM_LGAMMAD_STD(size_t x_len, const double *x, double *y);
+void RVVLM_LGAMMADI_STD(size_t x_len, const double *x, size_t stride_x,
+                        double *y, size_t stride_y);
+
 void RVVLM_TGAMMAD_STD(size_t x_len, const double *x, double *y);
 void RVVLM_TGAMMADI_STD(size_t x_len, const double *x, size_t stride_x,
                         double *y, size_t stride_y);

diff --git a/include/rvvlm_lgammaD.inc.h b/include/rvvlm_lgammaD.inc.h
diff --git a/src/rvvlm_lgammaD.c b/src/rvvlm_lgammaD.c
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <riscv_vector.h>
+#include <stdio.h>
+
+#include "rvvlm.h"
+#define API_SIGNATURE API_SIGNATURE_11
+#define STRIDE UNIT_STRIDE
+
+#include RVVLM_LGAMMAD_VSET_CONFIG
+
+#include "rvvlm_gammafuncsD.h"
+
+#include "rvvlm_lgammaD.inc.h"
diff --git a/src/rvvlm_lgammaDI.c b/src/rvvlm_lgammaDI.c
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <riscv_vector.h>
+#include <stdio.h>
+
+#include "rvvlm.h"
+#define API_SIGNATURE API_SIGNATURE_11
+#define STRIDE GENERAL_STRIDE
+
+#include RVVLM_LGAMMADI_VSET_CONFIG
+
+#include "rvvlm_gammafuncsD.h"
+
+#include "rvvlm_lgammaD.inc.h"
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -79,6 +79,8 @@ set(TEST_SOURCES
     src/test_sinhI.cpp
     src/test_tanh.cpp
     src/test_tanhI.cpp
+    src/test_lgamma.cpp
+    src/test_lgammaI.cpp
     src/test_tgamma.cpp
     src/test_tgammaI.cpp
 )

diff --git a/test/include/test_infra.h b/test/include/test_infra.h
@@ -23,6 +23,10 @@ void report_err_fp64(void (*test_func)(size_t, const double *, double *,
 void report_err_fp64(void (*test_func)(size_t, const double *, double *),
                      long double (*ref_func)(long double), const double *, int);
 
+void report_mixederr_fp64(void (*test_func)(size_t, const double *, double *),
+                          long double (*ref_func)(long double), double, double,
+                          int, double = 1.0);
+
 // Second most common interface: testing on 1 interval
 // for 1-in-1-out general-stride function
 void report_err_fp64(void (*test_func)(size_t, const double *, size_t, double *,
@@ -106,3 +110,6 @@ long double stirling_power(long double);
 long double stirling_correction(long double);
 long double tgammal_mod(long double);
 long double sinpix_by_pi(long double);
+long double lgammap1l(long double);
+long double log_stirling(long double);
+long double log_stirling_correction(long double);
diff --git a/test/src/test_infra.cpp b/test/src/test_infra.cpp
@@ -832,6 +832,80 @@ void report_err_fp64(double (*test_func)(double), double (*ref_func)(double),
   printf("Maximum observed ULP      error is %3.3lf\n", max_ulp_err);
 }
 
+void report_mixederr_fp64(void (*test_func)(size_t, const double *, double *),
+                          long double (*ref_func)(long double), double start,
+                          double end, int nb_pts, double threshold) {
+
+  long double y_ref;
+  double *x, *y, delta;
+  long double abs_err, rel_err, ulp_err, mixed_err;
+  long double max_abs_err, max_rel_err, max_ulp_err, max_mixed_err;
+
+  x = (double *)malloc(nb_pts * sizeof(double));
+  y = (double *)malloc(nb_pts * sizeof(double));
+
+  if (nb_pts <= 1) {
+    delta = 0.0;
+    nb_pts = 1;
+  } else {
+    delta = (end - start) / (double)(nb_pts - 1);
+  }
+
+  max_abs_err = 0.0;
+  max_rel_err = 0.0;
+  max_ulp_err = 0.0;
+  max_mixed_err = 0.0;
+
+  // fill up the set of test points
+  for (int i = 0; i < nb_pts; ++i) {
+    x[i] = start + (double)i * delta;
+  }
+
+  // call function under test
+  test_func((size_t)nb_pts, x, y);
+
+  // now for each point we compute error and log the max
+  for (int j = 0; j < nb_pts; j++) {
+    y_ref = ref_func((long double)x[j]);
+    abs_err = (long double)y[j] - y_ref;
+    abs_err = ABS(abs_err);
+    if (ABS((double)y_ref) > 0.0) {
+      rel_err = abs_err / ABS((double)y_ref);
+    } else {
+      rel_err = abs_err / 0x1.0p-1074;
+    }
+    ulp_err = abs_err / ulp_64((double)y_ref);
+
+    mixed_err = MIN(ulp_err, abs_err * 0x1.0p+52);
+
+    max_abs_err = MAX(max_abs_err, abs_err);
+    max_rel_err = MAX(max_rel_err, rel_err);
+    max_ulp_err = MAX(max_ulp_err, ulp_err);
+    max_mixed_err = MAX(max_mixed_err, mixed_err);
+
+    if (VERBOSE) {
+      union sui64_fp64 xxx, yyy;
+      xxx.f = x[j];
+      yyy.f = y[j];
+      printf("--input %24.17le, 0x%016lx, output %24.17le, 0x%016lx \n", xxx.f,
+             xxx.ui, yyy.f, yyy.ui);
+      printf("  reference %24.17le\n\n", (double)y_ref);
+    }
+  }
+  printf("----------------------------\n");
+  if ((ABS(start) > 100.) || (ABS(end) < 1.e-2)) {
+    printf("Tested %d points in [%8.3le, %8.3le]\n", nb_pts, start, end);
+  } else {
+    printf("Tested %d points in [%3.3lf, %3.3lf]\n", nb_pts, start, end);
+  }
+  printf("Maximum observed mixed error is %8.3Le\n", max_mixed_err);
+
+  EXPECT_LT((double)max_mixed_err, threshold);
+
+  free(x);
+  free(y);
+}
+
 void report_err2_fp64(void (*test_func)(size_t, const double *, const double *,
                                         double *),
                       long double (*ref_func)(long double, long double),
@@ -1350,3 +1424,25 @@ long double sinpix_by_pi(long double x) {
   y = -sinl(y) / pi;
   return y;
 }
+
+long double lgammap1l(long double x) {
+  long double one = 1.0L;
+  long double y = x + one;
+  y = lgammal(y);
+  return y;
+}
+
+long double log_stirling(long double x) {
+  long double e = 2.7182818284590452353602874713526615L;
+  long double half = 0x1.0p-1L;
+  long double x_minus_half = x - half;
+  long double y = x / e;
+  y = (x - half) * logl(y);
+  return y;
+}
+
+long double log_stirling_correction(long double x) {
+  long double log_stir = log_stirling(x);
+  long double y = lgammal(x) - log_stir;
+  return y;
+}
diff --git a/test/src/test_lgamma.cpp b/test/src/test_lgamma.cpp
@@ -0,0 +1,137 @@
+// SPDX-FileCopyrightText: 2023 Rivos Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+#include <math.h>
+
+#include "rvvlm.h"
+#include "test_infra.h"
+
+TEST(lgamma, special) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  COMMENT("lgamma: current chosen algorithm; reduced argument in FP64 only")
+
+  show_special_fp64(rvvlm_lgamma, "Special Value handling of this function");
+}
+
+TEST(lgamma, lgamma_neg) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  x_start = -0.999;
+  x_end = -0.0001;
+  nb_tests = 5000;
+  report_mixederr_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = -1.99;
+  x_end = -1.01;
+  nb_tests = 5000;
+  report_mixederr_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = -2.99;
+  x_end = -2.01;
+  nb_tests = 5000;
+  report_mixederr_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = -3.99;
+  x_end = -3.01;
+  nb_tests = 5000;
+  report_mixederr_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = -4.99;
+  x_end = -4.01;
+  nb_tests = 5000;
+  report_mixederr_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = -5.99;
+  x_end = -5.01;
+  nb_tests = 5000;
+  report_mixederr_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = -6.9999;
+  x_end = -6.00001;
+  nb_tests = 10000;
+  report_mixederr_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = -1000.9999;
+  x_end = -1000.0001;
+  nb_tests = 10000;
+  report_mixederr_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+}
+TEST(lgamma, lgamma_left) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  x_start = 0x1.0p-1070;
+  x_end = 0x1.0p-1065;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 0x1.0p-62;
+  x_end = 0x1.0p-58;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 0x1.0p-58;
+  x_end = 0x1.0p-40;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 0x1.0p-4;
+  x_end = 0x1.0p-1;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 0x1.0p-1;
+  x_end = 0x1.ffffffffp-1;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 1.0;
+  x_end = 0x1.ffffffffp0;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 0x1.0p1;
+  x_end = 0x1.ffffffffp1;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 0x1.0p2;
+  x_end = 0x1.dfffffffp-1;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+}
+
+TEST(lgamma, lgamma_right) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  x_start = 2.25;
+  x_end = 10.0;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 10.0;
+  x_end = 100.0;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+}
+
+TEST(lgamma, lgamma_large) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  x_start = 0x1.0p800;
+  x_end = 0x1.0p802;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+
+  x_start = 0x1.1p1012;
+  x_end = 0x1.fffffp1012;
+  nb_tests = 5000;
+  report_err_fp64(rvvlm_lgamma, lgammal, x_start, x_end, nb_tests);
+}