diff --git a/.clang-format b/.clang-format index 468ceea..67aace7 100644 --- a/.clang-format +++ b/.clang-format @@ -1 +1,5 @@ -BasedOnStyle: LLVM \ No newline at end of file +# SPDX-FileCopyrightText: 2023 Rivos Inc. +# +# SPDX-License-Identifier: Apache-2.0 + +BasedOnStyle: LLVM diff --git a/.clang-tidy b/.clang-tidy index d97a435..3ac9c9c 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023 Rivos Inc. +# +# SPDX-License-Identifier: Apache-2.0 + Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,-misc-const-correctness,-misc-unused-parameters,-misc-non-private-member-variables-in-classes,-misc-no-recursion,-misc-use-anonymous-namespace,readability-identifier-naming' CheckOptions: - key: readability-identifier-naming.ClassCase diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml new file mode 100644 index 0000000..41e6a53 --- /dev/null +++ b/.github/workflows/clang-format.yml @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: 2023 Rivos Inc. +# +# SPDX-License-Identifier: Apache-2.0 + +name: Format checking + +on: [push, pull_request] + +permissions: + contents: read + +jobs: + clang_formatting: + name: check_clang_format + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.11'] + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + show-progress: true + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Packages + run: | + sudo apt update + sudo apt install -y clang-format + python -m pip install click + + - name: Check Clang Format + run: | + set -euo pipefail + python tools/run-clang-format.py check + diff --git a/include/rvvlm.h b/include/rvvlm.h index ee53f37..2f96636 100644 --- a/include/rvvlm.h +++ b/include/rvvlm.h @@ -2,20 +2,30 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #ifdef __cplusplus extern "C" { #endif -union sui32_fp32 {int32_t si; uint32_t ui; float f;}; -union sui64_fp64 {int64_t si; uint64_t ui; double f; uint32_t ui_hilo[2];}; +union sui32_fp32 { + int32_t si; + uint32_t ui; + float f; +}; +union sui64_fp64 { + int64_t si; + uint64_t ui; + double f; + uint32_t ui_hilo[2]; +}; #define ui_hilo_HI 1 #define ui_hilo_LO 0 -// so that union sui64_f64 X will have X.hilo[HI] as the high bits (containing expoent) -// and X.hilo[LO] has the lower order bits (containing the lsb for example) +// so that union sui64_f64 X will have X.hilo[HI] as the high bits (containing +// expoent) and X.hilo[LO] has the lower order bits (containing the lsb for +// example) #define API_SIGNAUTRE_11 1 #define API_SIGNATURE_21 2 @@ -25,125 +35,130 @@ union sui64_fp64 {int64_t si; uint64_t ui; double f; uint32_t ui_hilo[2];}; #define UNIT_STRIDE 1 #define GENERAL_STRIDE 2 -#define SET_ROUNDTONEAREST \ -int original_frm; bool need_to_restore; \ -do { \ - (original_frm) = fegetround(); \ - need_to_restore = (original_frm != FE_TONEAREST); \ -} while(0) - -#define RESTORE_FRM \ -do { \ - if (need_to_restore) {fesetround((original_frm));} \ -} while(0) - - -#define PSTEP(coeff_j, x, poly, vlen) \ - __riscv_vfmadd((poly), (x), VFMV_VF((coeff_j), (vlen)), (vlen)) - -#define FAST2SUM(X, Y, S, s, vlen) \ -do { \ - S = __riscv_vfadd((X), (Y), (vlen)); \ - s = __riscv_vfsub((X), (S), (vlen)); \ - s = __riscv_vfadd((s), (Y), (vlen)); \ -} while(0) - -#define POS2SUM(X, Y, S, s, vlen) \ -do { \ - VFLOAT _first = __riscv_vfmax((X), (Y), (vlen)); \ - VFLOAT _second = __riscv_vfmin((X), (Y), (vlen)); \ - S = 
__riscv_vfadd((X), (Y), (vlen)); \ - s = __riscv_vfadd(__riscv_vfsub(_first, (S), (vlen)), _second, (vlen)); \ -} while(0) - -#define KNUTH2SUM(X, Y, S, s, vlen) \ -do { \ - S = __riscv_vfadd((X), (Y), (vlen)); \ - VFLOAT X_hat = __riscv_vfsub((S), (Y), (vlen)); \ - s = __riscv_vfadd(__riscv_vfsub((X), X_hat, (vlen)), \ - __riscv_vfsub((Y), __riscv_vfsub((S), X_hat, (vlen)), (vlen)), (vlen));\ -} while(0) - -#define PROD_X1Y1(x, y, prod_hi, prod_lo, vlen) \ -do {\ - prod_hi = __riscv_vfmul((x), (y), (vlen)); \ - prod_lo = __riscv_vfmsub((x), (y), (prod_hi), (vlen)); \ -} while(0) - -#define DIV_N1D2(numer, denom, delta_d, Q, q, vlen) \ -do { \ - Q = __riscv_vfdiv((numer), (denom), (vlen)); \ - q = __riscv_vfnmsub((Q), (denom), (numer), (vlen)); \ - q = __riscv_vfnmsac((q), (Q), (delta_d), (vlen)); \ - q = __riscv_vfmul(q, __riscv_vfrec7((denom), (vlen)), (vlen)); \ -} while(0) - -#define IDENTIFY2(__vclass, __stencil, __signature, __identity_mask, __vlen) \ -do { \ - __signature = __riscv_vand(__vclass, __stencil, __vlen); \ - __identity_mask = __riscv_vmsgtu( __riscv_vand(__vclass, __stencil, __vlen), \ - 0, __vlen) \ -} while(0) - -#define IDENTIFY(vclass, stencil, identity_mask, vlen) \ - identity_mask = __riscv_vmsgtu( __riscv_vand((vclass), (stencil), (vlen)), 0, (vlen)); - -#define FCLIP(vx, x_min, x_max, vlen) \ +#define SET_ROUNDTONEAREST \ + int original_frm; \ + bool need_to_restore; \ + do { \ + (original_frm) = fegetround(); \ + need_to_restore = (original_frm != FE_TONEAREST); \ + } while (0) + +#define RESTORE_FRM \ + do { \ + if (need_to_restore) { \ + fesetround((original_frm)); \ + } \ + } while (0) + +#define PSTEP(coeff_j, x, poly, vlen) \ + __riscv_vfmadd((poly), (x), VFMV_VF((coeff_j), (vlen)), (vlen)) + +#define FAST2SUM(X, Y, S, s, vlen) \ + do { \ + S = __riscv_vfadd((X), (Y), (vlen)); \ + s = __riscv_vfsub((X), (S), (vlen)); \ + s = __riscv_vfadd((s), (Y), (vlen)); \ + } while (0) + +#define POS2SUM(X, Y, S, s, vlen) \ + do { \ + VFLOAT _first = __riscv_vfmax((X), (Y), (vlen)); \ + VFLOAT _second = __riscv_vfmin((X), (Y), (vlen)); \ + S = __riscv_vfadd((X), (Y), (vlen)); \ + s = __riscv_vfadd(__riscv_vfsub(_first, (S), (vlen)), _second, (vlen)); \ + } while (0) + +#define KNUTH2SUM(X, Y, S, s, vlen) \ + do { \ + S = __riscv_vfadd((X), (Y), (vlen)); \ + VFLOAT X_hat = __riscv_vfsub((S), (Y), (vlen)); \ + s = __riscv_vfadd( \ + __riscv_vfsub((X), X_hat, (vlen)), \ + __riscv_vfsub((Y), __riscv_vfsub((S), X_hat, (vlen)), (vlen)), \ + (vlen)); \ + } while (0) + +#define PROD_X1Y1(x, y, prod_hi, prod_lo, vlen) \ + do { \ + prod_hi = __riscv_vfmul((x), (y), (vlen)); \ + prod_lo = __riscv_vfmsub((x), (y), (prod_hi), (vlen)); \ + } while (0) + +#define DIV_N1D2(numer, denom, delta_d, Q, q, vlen) \ + do { \ + Q = __riscv_vfdiv((numer), (denom), (vlen)); \ + q = __riscv_vfnmsub((Q), (denom), (numer), (vlen)); \ + q = __riscv_vfnmsac((q), (Q), (delta_d), (vlen)); \ + q = __riscv_vfmul(q, __riscv_vfrec7((denom), (vlen)), (vlen)); \ + } while (0) + +#define IDENTIFY2(__vclass, __stencil, __signature, __identity_mask, __vlen) \ + do { \ + __signature = __riscv_vand(__vclass, __stencil, __vlen); \ + __identity_mask = \ + __riscv_vmsgtu(__riscv_vand(__vclass, __stencil, __vlen), 0, __vlen) \ + } while (0) + +#define IDENTIFY(vclass, stencil, identity_mask, vlen) \ + identity_mask = \ + __riscv_vmsgtu(__riscv_vand((vclass), (stencil), (vlen)), 0, (vlen)); + +#define FCLIP(vx, x_min, x_max, vlen) \ __riscv_vfmin(__riscv_vfmax((vx), X_MIN, (vlen)), X_MAX, (vlen)) -#define 
FAST_LDEXP(num, exp, vlen) \ -do { \ - VINT n1 =__riscv_vsra((exp), 1, (vlen)); \ - VINT n2 = __riscv_vsub((exp), n1, (vlen)); \ - n1 = __riscv_vsll(n1, MAN_LEN, (vlen)); \ - num = I_AS_F( __riscv_vadd(F_AS_I((num)), n1, (vlen))); \ - n2 = __riscv_vadd( n2, EXP_BIAS, (vlen)); \ - n2 = __riscv_vsll(n2, MAN_LEN, (vlen)); \ - num = __riscv_vfmul((num), I_AS_F(n2), (vlen)); \ -} while(0) - -// Some of the functions have multiple implementations using different algorithms or styles. -// The following configure the name of each of these variations, thus allowing one to be -// set to the standard libm name. +#define FAST_LDEXP(num, exp, vlen) \ + do { \ + VINT n1 = __riscv_vsra((exp), 1, (vlen)); \ + VINT n2 = __riscv_vsub((exp), n1, (vlen)); \ + n1 = __riscv_vsll(n1, MAN_LEN, (vlen)); \ + num = I_AS_F(__riscv_vadd(F_AS_I((num)), n1, (vlen))); \ + n2 = __riscv_vadd(n2, EXP_BIAS, (vlen)); \ + n2 = __riscv_vsll(n2, MAN_LEN, (vlen)); \ + num = __riscv_vfmul((num), I_AS_F(n2), (vlen)); \ + } while (0) + +// Some of the functions have multiple implementations using different +// algorithms or styles. The following configure the name of each of these +// variations, thus allowing one to be set to the standard libm name. // FP64 exp function configuration #define RVVLM_EXPD_VSET_CONFIG "rvvlm_fp64m4.h" #define RVVLM_EXPD_STD rvvlm_expD_std #define RVVLM_EXPD_STD_R_EXTRA rvvlm_exp #define RVVLM_EXPD_STD_EPSIM rvvlm_expD_std_epsim -#define RVVLM_EXPD_TBL64 rvvlm_expD_tbl64 +#define RVVLM_EXPD_TBL64 rvvlm_expD_tbl64 #define RVVLM_EXPDI_VSET_CONFIG "rvvlm_fp64m4.h" #define RVVLM_EXPDI_STD rvvlm_expID_std #define RVVLM_EXPDI_STD_R_EXTRA rvvlm_expI #define RVVLM_EXPDI_STD_EPSIM rvvlm_expDI_std_epsim -#define RVVLM_EXPDI_TBL64 rvvlm_expDI_tbl64 +#define RVVLM_EXPDI_TBL64 rvvlm_expDI_tbl64 // FP64 exp2 function configuration #define RVVLM_EXP2D_VSET_CONFIG "rvvlm_fp64m2.h" #define RVVLM_EXP2D_STD rvvlm_exp2D_std #define RVVLM_EXP2D_STD_R_EXTRA rvvlm_exp2 #define RVVLM_EXP2D_STD_EPSIM rvvlm_exp2D_std_epsim -#define RVVLM_EXP2D_TBL64 rvvlm_exp2D_tbl64 +#define RVVLM_EXP2D_TBL64 rvvlm_exp2D_tbl64 #define RVVLM_EXP2DI_VSET_CONFIG "rvvlm_fp64m2.h" #define RVVLM_EXP2DI_STD rvvlm_exp2ID_std #define RVVLM_EXP2DI_STD_R_EXTRA rvvlm_exp2I #define RVVLM_EXP2DI_STD_EPSIM rvvlm_exp2DI_std_epsim -#define RVVLM_EXP2DI_TBL64 rvvlm_exp2DI_tbl64 +#define RVVLM_EXP2DI_TBL64 rvvlm_exp2DI_tbl64 // FP64 exp10 function configuration #define RVVLM_EXP10D_VSET_CONFIG "rvvlm_fp64m1.h" #define RVVLM_EXP10D_STD rvvlm_exp10D_std #define RVVLM_EXP10D_STD_R_EXTRA rvvlm_exp10 #define RVVLM_EXP10D_STD_EPSIM rvvlm_exp10D_std_epsim -#define RVVLM_EXP10D_TBL64 rvvlm_exp10D_tbl64 +#define RVVLM_EXP10D_TBL64 rvvlm_exp10D_tbl64 #define RVVLM_EXP10DI_VSET_CONFIG "rvvlm_fp64m1.h" #define RVVLM_EXP10DI_STD rvvlm_exp10ID_std #define RVVLM_EXP10DI_STD_R_EXTRA rvvlm_exp10I #define RVVLM_EXP10DI_STD_EPSIM rvvlm_exp10DI_std_epsim -#define RVVLM_EXP10DI_TBL64 rvvlm_exp10DI_tbl64 +#define RVVLM_EXP10DI_TBL64 rvvlm_exp10DI_tbl64 // FP64 expm1 function configuration #define RVVLM_EXPM1D_VSET_CONFIG "rvvlm_fp64m2.h" @@ -198,57 +213,77 @@ extern double logtbl_4_powD_128_hi_lo[256]; // Define the functions in the vector math library void RVVLM_EXPD_STD(size_t x_len, const double *x, double *y); -void RVVLM_EXPDI_STD(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXPDI_STD(size_t x_len, const double *x, size_t stride_x, double *y, + size_t stride_y); void RVVLM_EXPD_STD_R_EXTRA(size_t x_len, const double *x, double *y); 
-void RVVLM_EXPDI_STD_R_EXTRA(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXPDI_STD_R_EXTRA(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXPD_STD_EPSIM(size_t x_len, const double *x, double *y); -void RVVLM_EXPDI_STD_EPSIM(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXPDI_STD_EPSIM(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXPD_TBL64(size_t x_len, const double *x, double *y); -void RVVLM_EXPDI_TBL64(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXPDI_TBL64(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXP2D_STD(size_t x_len, const double *x, double *y); -void RVVLM_EXP2DI_STD(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXP2DI_STD(size_t x_len, const double *x, size_t stride_x, double *y, + size_t stride_y); void RVVLM_EXP2D_STD_R_EXTRA(size_t x_len, const double *x, double *y); -void RVVLM_EXP2DI_STD_R_EXTRA(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXP2DI_STD_R_EXTRA(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXP2D_STD_EPSIM(size_t x_len, const double *x, double *y); -void RVVLM_EXP2DI_STD_EPSIM(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXP2DI_STD_EPSIM(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXP2D_TBL64(size_t x_len, const double *x, double *y); -void RVVLM_EXP2DI_TBL64(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXP2DI_TBL64(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXP10D_STD(size_t x_len, const double *x, double *y); -void RVVLM_EXP10DI_STD(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXP10DI_STD(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXP10D_STD_R_EXTRA(size_t x_len, const double *x, double *y); -void RVVLM_EXP10DI_STD_R_EXTRA(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXP10DI_STD_R_EXTRA(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXP10D_STD_EPSIM(size_t x_len, const double *x, double *y); -void RVVLM_EXP10DI_STD_EPSIM(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXP10DI_STD_EPSIM(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXP10D_TBL64(size_t x_len, const double *x, double *y); -void RVVLM_EXP10DI_TBL64(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXP10DI_TBL64(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_EXPM1D_STD_EPSIM(size_t x_len, const double *x, double *y); -void RVVLM_EXPM1DI_STD_EPSIM(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_EXPM1DI_STD_EPSIM(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_LOGD_TBL128(size_t x_len, const double *x, double *y); -void RVVLM_LOGDI_TBL128(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_LOGDI_TBL128(size_t x_len, const double *x, size_t stride_x, + double *y, size_t 
stride_y); void RVVLM_LOGD_ATANH(size_t x_len, const double *x, double *y); -void RVVLM_LOGDI_ATANH(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_LOGDI_ATANH(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_LOG10D_TBL128(size_t x_len, const double *x, double *y); -void RVVLM_LOG10DI_TBL128(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_LOG10DI_TBL128(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_LOG10D_ATANH(size_t x_len, const double *x, double *y); -void RVVLM_LOG10DI_ATANH(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_LOG10DI_ATANH(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_LOG2D_TBL128(size_t x_len, const double *x, double *y); -void RVVLM_LOG2DI_TBL128(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_LOG2DI_TBL128(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_LOG2D_ATANH(size_t x_len, const double *x, double *y); -void RVVLM_LOG2DI_ATANH(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_LOG2DI_ATANH(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_LOG1PD_TBL128(size_t x_len, const double *x, double *y); -void RVVLM_LOG1PDI_TBL128(size_t x_len, const double *x, size_t stride_x, double *y, size_t stride_y); +void RVVLM_LOG1PDI_TBL128(size_t x_len, const double *x, size_t stride_x, + double *y, size_t stride_y); void RVVLM_POWD_TBL(size_t x_len, const double *x, const double *y, double *z); -void RVVLM_POWDI_TBL(size_t x_len, const double *x, size_t stride_x, - const double *y, size_t stride_y, double *z, size_t stride_z); - +void RVVLM_POWDI_TBL(size_t x_len, const double *x, size_t stride_x, + const double *y, size_t stride_y, double *z, + size_t stride_z); #ifdef __cplusplus } diff --git a/include/rvvlm_expD.h b/include/rvvlm_expD.h index f691770..ae957af 100644 --- a/include/rvvlm_expD.h +++ b/include/rvvlm_expD.h @@ -3,28 +3,28 @@ // SPDX-License-Identifier: Apache-2.0 // Macros for common small-block codes -#define EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen) { \ - VUINT vclass = __riscv_vfclass((vx), (vlen)); \ - IDENTIFY(vclass, class_NaN|class_Inf, (special_args), (vlen)) \ - UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ - if (nb_special_args > 0) { \ - /* Substitute -Inf with +0 */ \ - VBOOL id_mask; \ - IDENTIFY(vclass, class_negInf, id_mask, (vlen)) \ - vx = __riscv_vfmerge(vx, fp_posZero, id_mask, (vlen)); \ - vy_special = __riscv_vfadd((special_args), (vx), (vx), (vlen)); \ - vx = __riscv_vfmerge((vx), fp_posZero, (special_args), (vlen)); \ - } \ -} +#define EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen) \ + { \ + VUINT vclass = __riscv_vfclass((vx), (vlen)); \ + IDENTIFY(vclass, class_NaN | class_Inf, (special_args), (vlen)) \ + UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ + if (nb_special_args > 0) { \ + /* Substitute -Inf with +0 */ \ + VBOOL id_mask; \ + IDENTIFY(vclass, class_negInf, id_mask, (vlen)) \ + vx = __riscv_vfmerge(vx, fp_posZero, id_mask, (vlen)); \ + vy_special = __riscv_vfadd((special_args), (vx), (vx), (vlen)); \ + vx = __riscv_vfmerge((vx), fp_posZero, (special_args), (vlen)); \ + } \ + } #define LOG2_INV 0x1.71547652b82fep+0 -#define LOG2_HI 0x1.62e42fefa39efp-1 -#define 
LOG2_LO 0x1.abc9e3b39803fp-56 +#define LOG2_HI 0x1.62e42fefa39efp-1 +#define LOG2_LO 0x1.abc9e3b39803fp-56 -#define LOG2_INV_64 0x1.71547652b82fep+6 -#define LOG2_BY64_HI 0x1.62e42fefa39efp-7 -#define LOG2_BY64_LO 0x1.abc9e3b39803fp-62 +#define LOG2_INV_64 0x1.71547652b82fep+6 +#define LOG2_BY64_HI 0x1.62e42fefa39efp-7 +#define LOG2_BY64_LO 0x1.abc9e3b39803fp-62 -#define X_MAX 0x1.65p+9 +#define X_MAX 0x1.65p+9 #define X_MIN -0x1.77p+9 - diff --git a/include/rvvlm_expD.inc.h b/include/rvvlm_expD.inc.h index 30e7e4f..1c2af72 100644 --- a/include/rvvlm_expD.inc.h +++ b/include/rvvlm_expD.inc.h @@ -3,85 +3,85 @@ // SPDX-License-Identifier: Apache-2.0 // Macros for common small-block codes -#define EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen) \ -do { \ - VUINT vclass = __riscv_vfclass((vx), (vlen)); \ - IDENTIFY(vclass, class_NaN|class_Inf, (special_args), (vlen)) \ - UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ - if (nb_special_args > 0) { \ - /* Substitute -Inf with +0 */ \ - VBOOL id_mask; \ - IDENTIFY(vclass, class_negInf, id_mask, (vlen)) \ - vx = __riscv_vfmerge(vx, fp_posZero, id_mask, (vlen)); \ - vy_special = __riscv_vfadd((special_args), (vx), (vx), (vlen)); \ - vx = __riscv_vfmerge((vx), fp_posZero, (special_args), (vlen)); \ - } \ -} while(0) +#define EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen) \ + do { \ + VUINT vclass = __riscv_vfclass((vx), (vlen)); \ + IDENTIFY(vclass, class_NaN | class_Inf, (special_args), (vlen)) \ + UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ + if (nb_special_args > 0) { \ + /* Substitute -Inf with +0 */ \ + VBOOL id_mask; \ + IDENTIFY(vclass, class_negInf, id_mask, (vlen)) \ + vx = __riscv_vfmerge(vx, fp_posZero, id_mask, (vlen)); \ + vy_special = __riscv_vfadd((special_args), (vx), (vx), (vlen)); \ + vx = __riscv_vfmerge((vx), fp_posZero, (special_args), (vlen)); \ + } \ + } while (0) #if defined(COMPILE_FOR_EXP) #if (STRIDE == UNIT_STRIDE) -#define F_VER1 RVVLM_EXPD_STD -#define F_VER2 RVVLM_EXPD_STD_R_EXTRA -#define F_VER3 RVVLM_EXPD_STD_EPSIM -#define F_VER4 RVVLM_EXPD_TBL64 +#define F_VER1 RVVLM_EXPD_STD +#define F_VER2 RVVLM_EXPD_STD_R_EXTRA +#define F_VER3 RVVLM_EXPD_STD_EPSIM +#define F_VER4 RVVLM_EXPD_TBL64 #else -#define F_VER1 RVVLM_EXPDI_STD -#define F_VER2 RVVLM_EXPDI_STD_R_EXTRA -#define F_VER3 RVVLM_EXPDI_STD_EPSIM -#define F_VER4 RVVLM_EXPDI_TBL64 +#define F_VER1 RVVLM_EXPDI_STD +#define F_VER2 RVVLM_EXPDI_STD_R_EXTRA +#define F_VER3 RVVLM_EXPDI_STD_EPSIM +#define F_VER4 RVVLM_EXPDI_TBL64 #endif #define P_INV_STD 0x1.71547652b82fep+0 -#define P_HI_STD 0x1.62e42fefa39efp-1 -#define P_LO_STD 0x1.abc9e3b39803fp-56 +#define P_HI_STD 0x1.62e42fefa39efp-1 +#define P_LO_STD 0x1.abc9e3b39803fp-56 #define P_INV_TBL 0x1.71547652b82fep+6 -#define P_HI_TBL 0x1.62e42fefa39efp-7 -#define P_LO_TBL 0x1.abc9e3b39803fp-62 -#define X_MAX 0x1.65p+9 -#define X_MIN -0x1.77p+9 +#define P_HI_TBL 0x1.62e42fefa39efp-7 +#define P_LO_TBL 0x1.abc9e3b39803fp-62 +#define X_MAX 0x1.65p+9 +#define X_MIN -0x1.77p+9 #elif defined(COMPILE_FOR_EXP2) #if (STRIDE == UNIT_STRIDE) -#define F_VER1 RVVLM_EXP2D_STD -#define F_VER2 RVVLM_EXP2D_STD_R_EXTRA -#define F_VER3 RVVLM_EXP2D_STD_EPSIM -#define F_VER4 RVVLM_EXP2D_TBL64 +#define F_VER1 RVVLM_EXP2D_STD +#define F_VER2 RVVLM_EXP2D_STD_R_EXTRA +#define F_VER3 RVVLM_EXP2D_STD_EPSIM +#define F_VER4 RVVLM_EXP2D_TBL64 #else -#define F_VER1 RVVLM_EXP2DI_STD -#define F_VER2 RVVLM_EXP2DI_STD_R_EXTRA -#define F_VER3 RVVLM_EXP2DI_STD_EPSIM -#define F_VER4 RVVLM_EXP2DI_TBL64 
+#define F_VER1 RVVLM_EXP2DI_STD +#define F_VER2 RVVLM_EXP2DI_STD_R_EXTRA +#define F_VER3 RVVLM_EXP2DI_STD_EPSIM +#define F_VER4 RVVLM_EXP2DI_TBL64 #endif #define P_INV_STD 0x1.71547652b82fep+0 -#define P_HI_STD 0x1.62e42fefa39efp-1 -#define P_LO_STD 0x1.abc9e3b39803fp-56 +#define P_HI_STD 0x1.62e42fefa39efp-1 +#define P_LO_STD 0x1.abc9e3b39803fp-56 #define P_INV_TBL 0x1.0p6 -#define P_HI_TBL 0x1.0p-6 -#define P_LO_TBL 0x1.abc9e3b39803fp-56 -#define LOGB_HI 0x1.62e42fefa39efp-1 -#define LOGB_LO 0x1.abc9e3b39803fp-56 -#define X_MAX 0x1.018p10 -#define X_MIN -0x1.0ep10 +#define P_HI_TBL 0x1.0p-6 +#define P_LO_TBL 0x1.abc9e3b39803fp-56 +#define LOGB_HI 0x1.62e42fefa39efp-1 +#define LOGB_LO 0x1.abc9e3b39803fp-56 +#define X_MAX 0x1.018p10 +#define X_MIN -0x1.0ep10 #elif defined(COMPILE_FOR_EXP10) #if (STRIDE == 1) -#define F_VER1 RVVLM_EXP10D_STD -#define F_VER2 RVVLM_EXP10D_STD_R_EXTRA -#define F_VER3 RVVLM_EXP10D_STD_EPSIM -#define F_VER4 RVVLM_EXP10D_TBL64 +#define F_VER1 RVVLM_EXP10D_STD +#define F_VER2 RVVLM_EXP10D_STD_R_EXTRA +#define F_VER3 RVVLM_EXP10D_STD_EPSIM +#define F_VER4 RVVLM_EXP10D_TBL64 #else -#define F_VER1 RVVLM_EXP10DI_STD -#define F_VER2 RVVLM_EXP10DI_STD_R_EXTRA -#define F_VER3 RVVLM_EXP10DI_STD_EPSIM -#define F_VER4 RVVLM_EXP10DI_TBL64 +#define F_VER1 RVVLM_EXP10DI_STD +#define F_VER2 RVVLM_EXP10DI_STD_R_EXTRA +#define F_VER3 RVVLM_EXP10DI_STD_EPSIM +#define F_VER4 RVVLM_EXP10DI_TBL64 #endif #define P_INV_STD 0x1.a934f0979a371p+1 -#define P_HI_STD 0x1.34413509f79ffp-2 -#define P_LO_STD -0x1.9dc1da994fd21p-59 +#define P_HI_STD 0x1.34413509f79ffp-2 +#define P_LO_STD -0x1.9dc1da994fd21p-59 #define P_INV_TBL 0x1.a934f0979a371p+7 -#define P_HI_TBL 0x1.34413509f79ffp-8 -#define P_LO_TBL -0x1.9dc1da994fd21p-65 -#define LOGB_HI 0x1.26bb1bbb55516p+1 -#define LOGB_LO -0x1.f48ad494ea3e9p-53 -#define X_MAX 0x1.36p8 -#define X_MIN -0x1.46p8 +#define P_HI_TBL 0x1.34413509f79ffp-8 +#define P_LO_TBL -0x1.9dc1da994fd21p-65 +#define LOGB_HI 0x1.26bb1bbb55516p+1 +#define LOGB_LO -0x1.f48ad494ea3e9p-53 +#define X_MAX 0x1.36p8 +#define X_MIN -0x1.46p8 #else static_assert(false, "Must specify base of exponential" __FILE__); #endif @@ -91,370 +91,371 @@ static_assert(false, "Must specify base of exponential" __FILE__); // Version 1 is reduction to standard primary interval. // Reduced argument is represented as one FP64 variable. 
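// As a rough scalar model of what each lane of F_VER1 computes in the
// base-e build (COMPILE_FOR_EXP; the exp2/exp10 branches differ only in
// the reduction constants), where poly_exp is illustrative shorthand for
// the degree-11 polynomial evaluated below, not an identifier from this
// library:
//   n = (int)nearbyint(x * P_INV_STD);    // P_INV_STD ~ 1/log(2)
//   r = x - n*P_HI_STD - n*P_LO_STD;      // |r| <= log(2)/2, hi-lo split
//   y = ldexp(poly_exp(r), n);            // 2^n * exp(r) = exp(x)
// FAST_LDEXP applies the final 2^n in two halves (n = n1 + n2) so the
// intermediate exponent fields stay in range even when x is close to
// X_MIN or X_MAX.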
void F_VER1(API) { - size_t vlen; - VFLOAT vx, vy, vy_special; - VBOOL special_args; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - - // Set results for input of NaN and Inf; substitute them with zero - EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen); - - // Clip - vx = FCLIP(vx, X_MIN, X_MAX, vlen); - - // Argument reduction + size_t vlen; + VFLOAT vx, vy, vy_special; + VBOOL special_args; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + + // Set results for input of NaN and Inf; substitute them with zero + EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen); + + // Clip + vx = FCLIP(vx, X_MIN, X_MAX, vlen); + + // Argument reduction #if !defined(COMPILE_FOR_EXP2) - VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); - VINT n = __riscv_vfcvt_x(flt_n, vlen); - flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT r = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); + VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); + VINT n = __riscv_vfcvt_x(flt_n, vlen); + flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT r = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); #else - VINT n = __riscv_vfcvt_x(vx, vlen); - VFLOAT flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT r = __riscv_vfsub(vx, flt_n, vlen); - r = __riscv_vfmul(r, LOGB_HI, vlen); + VINT n = __riscv_vfcvt_x(vx, vlen); + VFLOAT flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT r = __riscv_vfsub(vx, flt_n, vlen); + r = __riscv_vfmul(r, LOGB_HI, vlen); #endif -#if defined(COMPILE_FOR_EXP) - r = __riscv_vfnmsac(r, P_LO_STD, flt_n, vlen); +#if defined(COMPILE_FOR_EXP) + r = __riscv_vfnmsac(r, P_LO_STD, flt_n, vlen); #elif defined(COMPILE_FOR_EXP10) - VFLOAT r_lo = __riscv_vfmul(flt_n, P_LO_STD, vlen); - r_lo = __riscv_vfnmsac(__riscv_vfmul(r, LOGB_LO, vlen), LOGB_HI, r_lo, vlen); - r = __riscv_vfmadd(r, LOGB_HI, r_lo, vlen); + VFLOAT r_lo = __riscv_vfmul(flt_n, P_LO_STD, vlen); + r_lo = + __riscv_vfnmsac(__riscv_vfmul(r, LOGB_LO, vlen), LOGB_HI, r_lo, vlen); + r = __riscv_vfmadd(r, LOGB_HI, r_lo, vlen); #endif - // Polynomial computation, we have a degree 11 - // We compute the part from r^3 in three segments, increasing parallelism - // Ideally the compiler will interleave the computations of the segments - VFLOAT poly_right = PSTEP( 0x1.71df804f1baa1p-19, r, - PSTEP( 0x1.28aa3ea739296p-22, 0x1.acf86201fd199p-26, r, - vlen), vlen); + // Polynomial computation, we have a degree 11 + // We compute the part from r^3 in three segments, increasing parallelism + // Ideally the compiler will interleave the computations of the segments + VFLOAT poly_right = PSTEP( + 0x1.71df804f1baa1p-19, r, + PSTEP(0x1.28aa3ea739296p-22, 0x1.acf86201fd199p-26, r, vlen), vlen); - VFLOAT poly_mid = PSTEP( 0x1.6c16c1825c970p-10, r, - PSTEP( 0x1.a01a00fe6f730p-13, 0x1.a0199e1789c72p-16, r, - vlen), vlen); + VFLOAT poly_mid = PSTEP( + 0x1.6c16c1825c970p-10, r, + PSTEP(0x1.a01a00fe6f730p-13, 0x1.a0199e1789c72p-16, r, vlen), vlen); - VFLOAT poly_left = PSTEP( 0x1.55555555554d2p-3, r, - PSTEP( 0x1.5555555551307p-5, 0x1.11111111309a4p-7, r, - vlen), vlen); + VFLOAT poly_left = + PSTEP(0x1.55555555554d2p-3, r, + PSTEP(0x1.5555555551307p-5, 0x1.11111111309a4p-7, r, vlen), vlen); - VFLOAT r_sq = __riscv_vfmul(r, r, vlen); - VFLOAT r_cube = __riscv_vfmul(r_sq, r, vlen); + VFLOAT r_sq = __riscv_vfmul(r, r, vlen); + VFLOAT r_cube = __riscv_vfmul(r_sq, r, vlen); - VFLOAT poly = 
__riscv_vfmadd(poly_right, r_cube, poly_mid, vlen); - poly = __riscv_vfmadd(poly, r_cube, poly_left, vlen); + VFLOAT poly = __riscv_vfmadd(poly_right, r_cube, poly_mid, vlen); + poly = __riscv_vfmadd(poly, r_cube, poly_left, vlen); - poly = PSTEP( 0x1.0000000000007p-1, r, poly, vlen); + poly = PSTEP(0x1.0000000000007p-1, r, poly, vlen); - r = __riscv_vfmacc(r, r_sq, poly, vlen); - vy = __riscv_vfadd(r, 0x1.0p0, vlen); + r = __riscv_vfmacc(r, r_sq, poly, vlen); + vy = __riscv_vfadd(r, 0x1.0p0, vlen); - // at this point, vy is the entire degree-11 polynomial - // vy ~=~ exp(r) + // at this point, vy is the entire degree-11 polynomial + // vy ~=~ exp(r) - // Need to compute 2^n * exp(r). - FAST_LDEXP(vy, n, vlen); + // Need to compute 2^n * exp(r). + FAST_LDEXP(vy, n, vlen); - // Incorporate results of exceptional inputs - vy = __riscv_vmerge(vy, vy_special, special_args, vlen); + // Incorporate results of exceptional inputs + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); - // copy vy into y and increment addr pointers - VFSTORE_OUTARG1(vy, vlen); + // copy vy into y and increment addr pointers + VFSTORE_OUTARG1(vy, vlen); - INCREMENT_INARG1(vlen); - INCREMENT_OUTARG1(vlen); - } - RESTORE_FRM; + INCREMENT_INARG1(vlen); + INCREMENT_OUTARG1(vlen); + } + RESTORE_FRM; } // Version 2 is reduction to standard primary interval. // Reduced argument is represented as two FP64 variables r, delta_r. void F_VER2(API) { - size_t vlen; - VFLOAT vx, vy, vy_special; - VBOOL special_args; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - - // Set results for input of NaN and Inf; substitute them with zero - EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen); - - // Clip - vx = FCLIP(vx, X_MIN, X_MAX, vlen); - - // Argument reduction + size_t vlen; + VFLOAT vx, vy, vy_special; + VBOOL special_args; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + + // Set results for input of NaN and Inf; substitute them with zero + EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen); + + // Clip + vx = FCLIP(vx, X_MIN, X_MAX, vlen); + + // Argument reduction #if defined(COMPILE_FOR_EXP) - VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); - VINT n = __riscv_vfcvt_x(flt_n, vlen); - flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT r_tmp = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); - VFLOAT r = __riscv_vfnmsac(r_tmp, P_LO_STD, flt_n, vlen); - VFLOAT delta_r = __riscv_vfsub(r_tmp, r, vlen); - delta_r = __riscv_vfnmsac(delta_r, P_LO_STD, flt_n, vlen); + VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); + VINT n = __riscv_vfcvt_x(flt_n, vlen); + flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT r_tmp = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); + VFLOAT r = __riscv_vfnmsac(r_tmp, P_LO_STD, flt_n, vlen); + VFLOAT delta_r = __riscv_vfsub(r_tmp, r, vlen); + delta_r = __riscv_vfnmsac(delta_r, P_LO_STD, flt_n, vlen); #elif defined(COMPILE_FOR_EXP2) - VINT n = __riscv_vfcvt_x(vx, vlen); - VFLOAT flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT r_tmp = __riscv_vfsub(vx, flt_n, vlen); - VFLOAT r = __riscv_vfmul(r_tmp, LOGB_HI, vlen); - VFLOAT delta_r = __riscv_vfmsub(r_tmp, LOGB_HI, r, vlen); - delta_r = __riscv_vfmacc(delta_r, LOGB_LO, r_tmp, vlen); + VINT n = __riscv_vfcvt_x(vx, vlen); + VFLOAT flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT r_tmp = __riscv_vfsub(vx, flt_n, vlen); + VFLOAT r = __riscv_vfmul(r_tmp, 
LOGB_HI, vlen); + VFLOAT delta_r = __riscv_vfmsub(r_tmp, LOGB_HI, r, vlen); + delta_r = __riscv_vfmacc(delta_r, LOGB_LO, r_tmp, vlen); #else - VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); - VINT n = __riscv_vfcvt_x(flt_n, vlen); - flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT s_tmp = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); - VFLOAT s = __riscv_vfnmsac(s_tmp, P_LO_STD, flt_n, vlen); - VFLOAT delta_s = __riscv_vfsub(s_tmp, s, vlen); - delta_s = __riscv_vfnmsac(delta_s, P_LO_STD, flt_n, vlen); - - VFLOAT r = __riscv_vfmul(s, LOGB_HI, vlen); - s_tmp = __riscv_vfmsub(s, LOGB_HI, r, vlen); - VFLOAT delta_r = __riscv_vfmul(s, LOGB_LO, vlen); - delta_r = __riscv_vfmacc(delta_r, LOGB_HI, delta_s, vlen); - delta_r = __riscv_vfadd(delta_r, s_tmp, vlen); + VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); + VINT n = __riscv_vfcvt_x(flt_n, vlen); + flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT s_tmp = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); + VFLOAT s = __riscv_vfnmsac(s_tmp, P_LO_STD, flt_n, vlen); + VFLOAT delta_s = __riscv_vfsub(s_tmp, s, vlen); + delta_s = __riscv_vfnmsac(delta_s, P_LO_STD, flt_n, vlen); + + VFLOAT r = __riscv_vfmul(s, LOGB_HI, vlen); + s_tmp = __riscv_vfmsub(s, LOGB_HI, r, vlen); + VFLOAT delta_r = __riscv_vfmul(s, LOGB_LO, vlen); + delta_r = __riscv_vfmacc(delta_r, LOGB_HI, delta_s, vlen); + delta_r = __riscv_vfadd(delta_r, s_tmp, vlen); #endif - // Polynomial computation, we have a degree 11 - // We compute the part from r^3 in three segments, increasing parallelism - // Ideally the compiler will interleave the computations of the segments - VFLOAT poly_right = PSTEP( 0x1.71df804f1baa1p-19, r, - PSTEP( 0x1.28aa3ea739296p-22, 0x1.acf86201fd199p-26, r, - vlen), vlen); + // Polynomial computation, we have a degree 11 + // We compute the part from r^3 in three segments, increasing parallelism + // Ideally the compiler will interleave the computations of the segments + VFLOAT poly_right = PSTEP( + 0x1.71df804f1baa1p-19, r, + PSTEP(0x1.28aa3ea739296p-22, 0x1.acf86201fd199p-26, r, vlen), vlen); - VFLOAT poly_mid = PSTEP( 0x1.6c16c1825c970p-10, r, - PSTEP( 0x1.a01a00fe6f730p-13, 0x1.a0199e1789c72p-16, r, - vlen), vlen); + VFLOAT poly_mid = PSTEP( + 0x1.6c16c1825c970p-10, r, + PSTEP(0x1.a01a00fe6f730p-13, 0x1.a0199e1789c72p-16, r, vlen), vlen); - VFLOAT poly_left = PSTEP( 0x1.55555555554d2p-3, r, - PSTEP( 0x1.5555555551307p-5, 0x1.11111111309a4p-7, r, - vlen), vlen); + VFLOAT poly_left = + PSTEP(0x1.55555555554d2p-3, r, + PSTEP(0x1.5555555551307p-5, 0x1.11111111309a4p-7, r, vlen), vlen); - VFLOAT r_sq = __riscv_vfmul(r, r, vlen); - VFLOAT r_cube = __riscv_vfmul(r_sq, r, vlen); + VFLOAT r_sq = __riscv_vfmul(r, r, vlen); + VFLOAT r_cube = __riscv_vfmul(r_sq, r, vlen); - VFLOAT poly = __riscv_vfmadd(poly_right, r_cube, poly_mid, vlen); - poly = __riscv_vfmadd(poly, r_cube, poly_left, vlen); + VFLOAT poly = __riscv_vfmadd(poly_right, r_cube, poly_mid, vlen); + poly = __riscv_vfmadd(poly, r_cube, poly_left, vlen); - poly = PSTEP( 0x1.0000000000007p-1, r, poly, vlen); + poly = PSTEP(0x1.0000000000007p-1, r, poly, vlen); - poly = __riscv_vfmadd(poly, r_sq, delta_r, vlen); - vy = __riscv_vfadd(poly, r, vlen); - vy = __riscv_vfadd(vy, 0x1.0p0, vlen); - // at this point, vy is the entire degree-11 polynomial - // vy ~=~ exp(r) + poly = __riscv_vfmadd(poly, r_sq, delta_r, vlen); + vy = __riscv_vfadd(poly, r, vlen); + vy = __riscv_vfadd(vy, 0x1.0p0, vlen); + // at this point, vy is the entire degree-11 polynomial + // vy ~=~ exp(r) - // Need to compute 2^n * exp(r). 
- FAST_LDEXP(vy, n, vlen); + // Need to compute 2^n * exp(r). + FAST_LDEXP(vy, n, vlen); - // Incorporate results of exceptional inputs - vy = __riscv_vmerge(vy, vy_special, special_args, vlen); + // Incorporate results of exceptional inputs + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); - // copy vy into y and increment addr pointers - VFSTORE_OUTARG1(vy, vlen); + // copy vy into y and increment addr pointers + VFSTORE_OUTARG1(vy, vlen); - INCREMENT_INARG1(vlen); - INCREMENT_OUTARG1(vlen); - } - RESTORE_FRM; + INCREMENT_INARG1(vlen); + INCREMENT_OUTARG1(vlen); + } + RESTORE_FRM; } // Version 3 computes 1 + r + r^2/2 to extra precision using FP techniques // The r in this expression is the extra-precise reduced argument void F_VER3(API) { - size_t vlen; - VFLOAT vx, vy, vy_special; - VBOOL special_args; - double Peg = 0x1.8p+27; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - - // Set results for input of NaN and Inf; substitute them with zero - EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen); - - // Clip - vx = FCLIP(vx, X_MIN, X_MAX, vlen); - - // Argument reduction + size_t vlen; + VFLOAT vx, vy, vy_special; + VBOOL special_args; + double Peg = 0x1.8p+27; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + + // Set results for input of NaN and Inf; substitute them with zero + EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen); + + // Clip + vx = FCLIP(vx, X_MIN, X_MAX, vlen); + + // Argument reduction #if defined(COMPILE_FOR_EXP) - VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); - VINT n = __riscv_vfcvt_x(flt_n, vlen); - flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT r = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); - VFLOAT r_hi = __riscv_vfadd(r, Peg, vlen); - r_hi = __riscv_vfsub(r_hi, Peg, vlen); - VFLOAT r_lo = __riscv_vfsub(r, r_hi, vlen); - r_lo = __riscv_vfnmsac(r_lo, P_LO_STD, flt_n, vlen); - r = __riscv_vfadd(r_hi, r_lo, vlen); + VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); + VINT n = __riscv_vfcvt_x(flt_n, vlen); + flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT r = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); + VFLOAT r_hi = __riscv_vfadd(r, Peg, vlen); + r_hi = __riscv_vfsub(r_hi, Peg, vlen); + VFLOAT r_lo = __riscv_vfsub(r, r_hi, vlen); + r_lo = __riscv_vfnmsac(r_lo, P_LO_STD, flt_n, vlen); + r = __riscv_vfadd(r_hi, r_lo, vlen); #elif defined(COMPILE_FOR_EXP2) - VINT n = __riscv_vfcvt_x(vx, vlen); - VFLOAT flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT r_tmp = __riscv_vfsub(vx, flt_n, vlen); - VFLOAT r_hi = VFMV_VF(Peg, vlen); - r_hi = __riscv_vfmacc(r_hi, LOGB_HI, r_tmp, vlen); - r_hi = __riscv_vfsub(r_hi, Peg, vlen); - VFLOAT r_lo = __riscv_vfmsub(r_tmp, LOGB_HI, r_hi, vlen); - r_lo = __riscv_vfmacc(r_lo, LOGB_LO, r_tmp, vlen); - VFLOAT r = __riscv_vfadd(r_hi, r_lo, vlen); + VINT n = __riscv_vfcvt_x(vx, vlen); + VFLOAT flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT r_tmp = __riscv_vfsub(vx, flt_n, vlen); + VFLOAT r_hi = VFMV_VF(Peg, vlen); + r_hi = __riscv_vfmacc(r_hi, LOGB_HI, r_tmp, vlen); + r_hi = __riscv_vfsub(r_hi, Peg, vlen); + VFLOAT r_lo = __riscv_vfmsub(r_tmp, LOGB_HI, r_hi, vlen); + r_lo = __riscv_vfmacc(r_lo, LOGB_LO, r_tmp, vlen); + VFLOAT r = __riscv_vfadd(r_hi, r_lo, vlen); #else - VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); - VINT n = __riscv_vfcvt_x(flt_n, vlen); - flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT s_tmp 
= __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); - VFLOAT s = __riscv_vfnmsac(s_tmp, P_LO_STD, flt_n, vlen); - VFLOAT delta_s = __riscv_vfsub(s_tmp, s, vlen); - delta_s = __riscv_vfnmsac(delta_s, P_LO_STD, flt_n, vlen); - - VFLOAT r_hi = VFMV_VF(Peg, vlen); - r_hi = __riscv_vfmacc(r_hi, LOGB_HI, s, vlen); - r_hi = __riscv_vfsub(r_hi, Peg, vlen); - VFLOAT r_lo = __riscv_vfmsub(s, LOGB_HI, r_hi, vlen); - r_lo = __riscv_vfmacc(r_lo, LOGB_LO, s, vlen); - VFLOAT r = __riscv_vfadd(r_hi, r_lo, vlen); + VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); + VINT n = __riscv_vfcvt_x(flt_n, vlen); + flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT s_tmp = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); + VFLOAT s = __riscv_vfnmsac(s_tmp, P_LO_STD, flt_n, vlen); + VFLOAT delta_s = __riscv_vfsub(s_tmp, s, vlen); + delta_s = __riscv_vfnmsac(delta_s, P_LO_STD, flt_n, vlen); + + VFLOAT r_hi = VFMV_VF(Peg, vlen); + r_hi = __riscv_vfmacc(r_hi, LOGB_HI, s, vlen); + r_hi = __riscv_vfsub(r_hi, Peg, vlen); + VFLOAT r_lo = __riscv_vfmsub(s, LOGB_HI, r_hi, vlen); + r_lo = __riscv_vfmacc(r_lo, LOGB_LO, s, vlen); + VFLOAT r = __riscv_vfadd(r_hi, r_lo, vlen); #endif - // r_hi + r_lo is extra-precise reduced argument - // and also 1 + r_hi + r_hi^2/2 is computable error-free - // 1 + (r_hi + r_lo) + (r_hi + r_lo)^2/2 is - // P_head + P_tail where - // P_head = 1 + r_hi + r_hi^2/2 = 1 + r_hi*(1 + r_hi/2) - // P_tail = r_lo + r_lo(r_hi + r_lo/2) - VFLOAT P_head = PSTEP( 0x1.0p0, r_hi, - PSTEP( 0x1.0p0, 0x1.0p-1, r_hi, - vlen), vlen); - - VFLOAT P_tail = __riscv_vfmacc(r_hi, 0x1.0p-1, r_lo, vlen); - P_tail = __riscv_vfmadd(P_tail, r_lo, r_lo, vlen); - - // Polynomial computation, we have a degree 11 - // We compute the part from r^3 in three segments, increasing parallelism - // Ideally the compiler will interleave the computations of the segments - VFLOAT poly_right = PSTEP( 0x1.71dfc3d471117p-19, r, - PSTEP( 0x1.289feff6e458ap-22, 0x1.ac82ecdeaa346p-26, r, - vlen), vlen); - - VFLOAT poly_mid = PSTEP( 0x1.6c16c17ce42b2p-10, r, - PSTEP( 0x1.a01a00e44cd99p-13, 0x1.a019aac62dedbp-16, r, - vlen), vlen); - - VFLOAT poly_left = PSTEP( 0x1.55555555554cap-3, r, - PSTEP( 0x1.555555555319fp-5, 0x1.1111111134581p-7, r, - vlen), vlen); - - VFLOAT r_sq = __riscv_vfmul(r, r, vlen); - VFLOAT r_cube = __riscv_vfmul(r_sq, r, vlen); - - VFLOAT poly = __riscv_vfmadd(poly_right, r_cube, poly_mid, vlen); - poly = __riscv_vfmadd(poly, r_cube, poly_left, vlen); - poly = __riscv_vfmadd(poly, r_cube, P_tail, vlen); - vy = __riscv_vfadd(poly, P_head, vlen); - // at this point, vy is the entire degree-11 polynomial - // vy ~=~ exp(r) - - // Need to compute 2^n * exp(r). 
- FAST_LDEXP(vy, n, vlen); - - // Incorporate results of exceptional inputs - vy = __riscv_vmerge(vy, vy_special, special_args, vlen); - - // copy vy into y and increment addr pointers - VFSTORE_OUTARG1(vy, vlen); - - INCREMENT_INARG1(vlen); - INCREMENT_OUTARG1(vlen); - } - RESTORE_FRM; + // r_hi + r_lo is extra-precise reduced argument + // and also 1 + r_hi + r_hi^2/2 is computable error-free + // 1 + (r_hi + r_lo) + (r_hi + r_lo)^2/2 is + // P_head + P_tail where + // P_head = 1 + r_hi + r_hi^2/2 = 1 + r_hi*(1 + r_hi/2) + // P_tail = r_lo + r_lo(r_hi + r_lo/2) + VFLOAT P_head = + PSTEP(0x1.0p0, r_hi, PSTEP(0x1.0p0, 0x1.0p-1, r_hi, vlen), vlen); + + VFLOAT P_tail = __riscv_vfmacc(r_hi, 0x1.0p-1, r_lo, vlen); + P_tail = __riscv_vfmadd(P_tail, r_lo, r_lo, vlen); + + // Polynomial computation, we have a degree 11 + // We compute the part from r^3 in three segments, increasing parallelism + // Ideally the compiler will interleave the computations of the segments + VFLOAT poly_right = PSTEP( + 0x1.71dfc3d471117p-19, r, + PSTEP(0x1.289feff6e458ap-22, 0x1.ac82ecdeaa346p-26, r, vlen), vlen); + + VFLOAT poly_mid = PSTEP( + 0x1.6c16c17ce42b2p-10, r, + PSTEP(0x1.a01a00e44cd99p-13, 0x1.a019aac62dedbp-16, r, vlen), vlen); + + VFLOAT poly_left = + PSTEP(0x1.55555555554cap-3, r, + PSTEP(0x1.555555555319fp-5, 0x1.1111111134581p-7, r, vlen), vlen); + + VFLOAT r_sq = __riscv_vfmul(r, r, vlen); + VFLOAT r_cube = __riscv_vfmul(r_sq, r, vlen); + + VFLOAT poly = __riscv_vfmadd(poly_right, r_cube, poly_mid, vlen); + poly = __riscv_vfmadd(poly, r_cube, poly_left, vlen); + poly = __riscv_vfmadd(poly, r_cube, P_tail, vlen); + vy = __riscv_vfadd(poly, P_head, vlen); + // at this point, vy is the entire degree-11 polynomial + // vy ~=~ exp(r) + + // Need to compute 2^n * exp(r). + FAST_LDEXP(vy, n, vlen); + + // Incorporate results of exceptional inputs + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); + + // copy vy into y and increment addr pointers + VFSTORE_OUTARG1(vy, vlen); + + INCREMENT_INARG1(vlen); + INCREMENT_OUTARG1(vlen); + } + RESTORE_FRM; } // Version 4 is a table driven algorithm. The table contains exp(j/64) // for j = 0, 1, ..., 63. One key feature is that this table of values // are stored as fixed point numbers in "Q62" format, i.e. they correspond -// to 2^62 * exp(j/64). +// to 2^62 * exp(j/64). 
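// In outline (base-e build): with n = round(x * 64/log(2)) written as
// n = 64*m + j, 0 <= j < 64, we need exp(x) = 2^m * 2^(j/64) * exp(r),
// and the Q62 table entry T = tbl[j] supplies 2^62 * 2^(j/64).  A rough
// scalar model of the fixed-point steps below, ignoring vsmul's rounding
// and vsadd's saturation (tbl is illustrative shorthand for
// expD_tbl64_fixedpt):
//   int64_t P = (int64_t)(poly * 0x1.0p63);          // Q63 of exp(r) - 1
//   int64_t Q = (int64_t)(((__int128)P * T) >> 63);  // Q62 of (exp(r)-1)*2^(j/64)
//   double  y = (double)(Q + T);                     // ~ 2^62 * 2^(j/64) * exp(r)
//   y = ldexp(y, (n >> 6) - 62);                     // apply 2^m, remove 2^62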
void F_VER4(API) { - size_t vlen; - VFLOAT vx, vy, vy_special; - VBOOL special_args; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - - // Set results for input of NaN and Inf; substitute them with zero - EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen); - - // Clip - vx = FCLIP(vx, X_MIN, X_MAX, vlen); - - // Argument reduction + size_t vlen; + VFLOAT vx, vy, vy_special; + VBOOL special_args; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + + // Set results for input of NaN and Inf; substitute them with zero + EXCEPTION_HANDLING_EXP(vx, special_args, vy_special, vlen); + + // Clip + vx = FCLIP(vx, X_MIN, X_MAX, vlen); + + // Argument reduction #if !defined(COMPILE_FOR_EXP2) - VFLOAT flt_n = __riscv_vfmul(vx, P_INV_TBL, vlen); - VINT n = __riscv_vfcvt_x(flt_n, vlen); - flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT r = __riscv_vfnmsac(vx, P_HI_TBL, flt_n, vlen); + VFLOAT flt_n = __riscv_vfmul(vx, P_INV_TBL, vlen); + VINT n = __riscv_vfcvt_x(flt_n, vlen); + flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT r = __riscv_vfnmsac(vx, P_HI_TBL, flt_n, vlen); #else - VINT n = __riscv_vfcvt_x(__riscv_vfmul(vx, 0x1.0p6, vlen), vlen); - VFLOAT flt_n = __riscv_vfcvt_f(n, vlen); - VFLOAT r = __riscv_vfnmsub(flt_n, 0x1.0p-6, vx, vlen); - r = __riscv_vfmul(r, LOGB_HI, vlen); + VINT n = __riscv_vfcvt_x(__riscv_vfmul(vx, 0x1.0p6, vlen), vlen); + VFLOAT flt_n = __riscv_vfcvt_f(n, vlen); + VFLOAT r = __riscv_vfnmsub(flt_n, 0x1.0p-6, vx, vlen); + r = __riscv_vfmul(r, LOGB_HI, vlen); #endif -#if defined(COMPILE_FOR_EXP) - r = __riscv_vfnmsac(r, P_LO_TBL, flt_n, vlen); +#if defined(COMPILE_FOR_EXP) + r = __riscv_vfnmsac(r, P_LO_TBL, flt_n, vlen); #elif defined(COMPILE_FOR_EXP10) - VFLOAT r_lo = __riscv_vfmul(flt_n, P_LO_TBL, vlen); - r_lo = __riscv_vfnmsac(__riscv_vfmul(r, LOGB_LO, vlen), LOGB_HI, r_lo, vlen); - r = __riscv_vfmadd(r, LOGB_HI, r_lo, vlen); + VFLOAT r_lo = __riscv_vfmul(flt_n, P_LO_TBL, vlen); + r_lo = + __riscv_vfnmsac(__riscv_vfmul(r, LOGB_LO, vlen), LOGB_HI, r_lo, vlen); + r = __riscv_vfmadd(r, LOGB_HI, r_lo, vlen); #endif - // Polynomial computation, we have a degree 5 - // We break this up into 2 pieces - // Ideally the compiler will interleave the computations of the segments - VFLOAT poly_right = PSTEP(0x1.5555722e87735p-5, 0x1.1107f5fc29bb7p-7, r, vlen); - VFLOAT poly_left = PSTEP(0x1.fffffffffe1f5p-2, 0x1.55555556582a8p-3, r, vlen); - VFLOAT r_sq = __riscv_vfmul(r, r, vlen); - VFLOAT poly = __riscv_vfmadd(poly_right, r_sq, poly_left, vlen); - poly = __riscv_vfmadd(poly, r_sq, r, vlen); - poly = __riscv_vfmul(poly, 0x1.0p63, vlen); - - VINT P = __riscv_vfcvt_x(poly, vlen); - VINT j = __riscv_vand(n, 0x3f, vlen); - j = __riscv_vsll(j, 3, vlen); - VINT T = __riscv_vluxei64(expD_tbl64_fixedpt, I_AS_U(j), vlen); - - P = __riscv_vsmul(P, T, 1, vlen); - P = __riscv_vsadd(P, T, vlen); - vy = __riscv_vfcvt_f(P, vlen); - // at this point, vy ~=~ 2^62 * exp(r) - - // Need to compute 2^(n-62) * exp(r). 
- n = __riscv_vsra(n, 6, vlen); - - n = __riscv_vsub(n, 62, vlen); - FAST_LDEXP(vy, n, vlen); - - vy = __riscv_vmerge(vy, vy_special, special_args, vlen); - - // copy vy into y and increment addr pointers - VFSTORE_OUTARG1(vy, vlen); - - INCREMENT_INARG1(vlen); - INCREMENT_OUTARG1(vlen); - } - RESTORE_FRM; + // Polynomial computation, we have a degree 5 + // We break this up into 2 pieces + // Ideally the compiler will interleave the computations of the segments + VFLOAT poly_right = + PSTEP(0x1.5555722e87735p-5, 0x1.1107f5fc29bb7p-7, r, vlen); + VFLOAT poly_left = + PSTEP(0x1.fffffffffe1f5p-2, 0x1.55555556582a8p-3, r, vlen); + VFLOAT r_sq = __riscv_vfmul(r, r, vlen); + VFLOAT poly = __riscv_vfmadd(poly_right, r_sq, poly_left, vlen); + poly = __riscv_vfmadd(poly, r_sq, r, vlen); + poly = __riscv_vfmul(poly, 0x1.0p63, vlen); + + VINT P = __riscv_vfcvt_x(poly, vlen); + VINT j = __riscv_vand(n, 0x3f, vlen); + j = __riscv_vsll(j, 3, vlen); + VINT T = __riscv_vluxei64(expD_tbl64_fixedpt, I_AS_U(j), vlen); + + P = __riscv_vsmul(P, T, 1, vlen); + P = __riscv_vsadd(P, T, vlen); + vy = __riscv_vfcvt_f(P, vlen); + // at this point, vy ~=~ 2^62 * exp(r) + + // Need to compute 2^(n-62) * exp(r). + n = __riscv_vsra(n, 6, vlen); + + n = __riscv_vsub(n, 62, vlen); + FAST_LDEXP(vy, n, vlen); + + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); + + // copy vy into y and increment addr pointers + VFSTORE_OUTARG1(vy, vlen); + + INCREMENT_INARG1(vlen); + INCREMENT_OUTARG1(vlen); + } + RESTORE_FRM; } - - diff --git a/include/rvvlm_expm1D.inc.h b/include/rvvlm_expm1D.inc.h index 161bbaf..abec00c 100644 --- a/include/rvvlm_expm1D.inc.h +++ b/include/rvvlm_expm1D.inc.h @@ -3,153 +3,154 @@ // SPDX-License-Identifier: Apache-2.0 // Macros for common small-block codes -#define EXCEPTION_HANDLING_EXPM1(vx, special_args, vy_special, vlen) \ -do { \ - VUINT vclass = __riscv_vfclass((vx), (vlen)); \ - IDENTIFY(vclass, class_NaN|class_Inf, (special_args), (vlen)) \ - UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ - if (nb_special_args > 0) { \ - /* Substitute -Inf with -1 */ \ - VBOOL id_mask; \ - IDENTIFY(vclass, class_negInf, id_mask, (vlen)) \ - vx = __riscv_vfmerge((vx), fp_negOne, id_mask, (vlen)); \ - vy_special = __riscv_vfmul((special_args), (vx), fp_posOne, (vlen)); \ - vx = __riscv_vfmerge((vx), fp_posZero, (special_args), (vlen)); \ - } \ -} while(0) +#define EXCEPTION_HANDLING_EXPM1(vx, special_args, vy_special, vlen) \ + do { \ + VUINT vclass = __riscv_vfclass((vx), (vlen)); \ + IDENTIFY(vclass, class_NaN | class_Inf, (special_args), (vlen)) \ + UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ + if (nb_special_args > 0) { \ + /* Substitute -Inf with -1 */ \ + VBOOL id_mask; \ + IDENTIFY(vclass, class_negInf, id_mask, (vlen)) \ + vx = __riscv_vfmerge((vx), fp_negOne, id_mask, (vlen)); \ + vy_special = __riscv_vfmul((special_args), (vx), fp_posOne, (vlen)); \ + vx = __riscv_vfmerge((vx), fp_posZero, (special_args), (vlen)); \ + } \ + } while (0) #if (STRIDE == UNIT_STRIDE) -#define F_VER1 RVVLM_EXPM1D_STD_EPSIM +#define F_VER1 RVVLM_EXPM1D_STD_EPSIM #else -#define F_VER1 RVVLM_EXPM1DI_STD_EPSIM +#define F_VER1 RVVLM_EXPM1DI_STD_EPSIM #endif #define P_INV_STD 0x1.71547652b82fep+0 -#define P_HI_STD 0x1.62e42fefa39efp-1 -#define P_LO_STD 0x1.abc9e3b39803fp-56 +#define P_HI_STD 0x1.62e42fefa39efp-1 +#define P_LO_STD 0x1.abc9e3b39803fp-56 #define P_INV_TBL 0x1.71547652b82fep+6 -#define P_HI_TBL 0x1.62e42fefa39efp-7 -#define P_LO_TBL 0x1.abc9e3b39803fp-62 -#define X_MAX 
0x1.65p+9 -#define X_MIN -0x1.5p+5 +#define P_HI_TBL 0x1.62e42fefa39efp-7 +#define P_LO_TBL 0x1.abc9e3b39803fp-62 +#define X_MAX 0x1.65p+9 +#define X_MIN -0x1.5p+5 #include // We use the EPsim version of expD to compute expm1 -void F_VER1( API ){ - size_t vlen; - VFLOAT vx, vy, vy_special; - VBOOL special_args; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - - EXCEPTION_HANDLING_EXPM1(vx, special_args, vy_special, vlen); - - // Clip - vx = FCLIP(vx, X_MIN, X_MAX, vlen); - - // Argument reduction - VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); - VINT n = __riscv_vfcvt_x(flt_n, vlen); - flt_n = __riscv_vfcvt_f(n, vlen); - - VFLOAT r_tmp = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); - VFLOAT r = __riscv_vfnmsub(flt_n, P_LO_STD, r_tmp, vlen); - VFLOAT r_lo = __riscv_vfsub(r_tmp, r, vlen); - r_lo = __riscv_vfnmsac(r_lo, P_LO_STD, flt_n, vlen); - // r is the reduced argument in working precision; but r + r_lo is extra precise - // exp(r+r_lo) is 1 + (r+r_lo) + (r+r_lo)^2/2 + r^3 * polynomial(r) - // 1 + r + r^2/2 + (r_lo + r * r_lo) + r^3 * polynomial(r) - r_lo = __riscv_vfmacc(r_lo, r, r_lo, vlen); - // 1 + r + r^2/2 + r_lo + r^3 * polynomial(r) - - // Compute P_head + P_tail as r + r^2/2 accurately - VFLOAT r_prime = __riscv_vfmul(r, 0x1.0p-1, vlen); // this coeff is 1/2 - VFLOAT P_head = __riscv_vfmadd(r, r_prime, r, vlen); - VFLOAT P_tail = __riscv_vfsub(r, P_head, vlen); - P_tail = __riscv_vfmacc(P_tail, r, r_prime, vlen); - - // Polynomial computation, we have a degree 11 polynomial - VFLOAT poly_right = PSTEP( 0x1.71ddf7aef0679p-19, r, - PSTEP( 0x1.27e4e210af311p-22, r, - PSTEP( 0x1.af5ff637cd647p-26, 0x1.1f6562eae5ba9p-29, r, - vlen), vlen), vlen); - - VFLOAT poly_mid = PSTEP( 0x1.6c16c16c166f3p-10, r, - PSTEP( 0x1.a01a01b0207e3p-13, 0x1.a01a01a4af90ap-16, r, - vlen), vlen); - - VFLOAT poly_left = PSTEP( 0x1.5555555555559p-3, r, - PSTEP( 0x1.5555555555556p-5, 0x1.111111110f62ap-7, r, - vlen), vlen); - - VFLOAT r_sq = __riscv_vfmul(r, r, vlen); - VFLOAT r_cube = __riscv_vfmul(r_sq, r, vlen); - - VFLOAT poly = __riscv_vfmadd(poly_right, r_cube, poly_mid, vlen); - poly = __riscv_vfmadd(poly, r_cube, poly_left, vlen); - // At this point, exp(r) is 1 + P_head + P_tail + r_lo + r^3 * poly - // expm1(x) = 2^n ( 1 - 2^(-n) + P_head + P_tail + r_lo + r^3 * poly ) - - - // Compute 1 - 2^(-n) accurately using Knuth-2-sum - // - VFLOAT One = VFMV_VF(fp_posOne, vlen); - VINT n_clip = __riscv_vmin(n, 64, vlen); // n_clip <= 64; note that n_clip >= -61 - n_clip = __riscv_vrsub(n_clip, -1025, vlen); // (sign,expo) of -2^(-n_clip) - VFLOAT u = I_AS_F(__riscv_vsll(n_clip, 52, vlen)); // u = -2^(-n_clip) - VFLOAT A, a; - KNUTH2SUM(One, u, A, a, vlen); - - //VBOOL n_ge_0 = __riscv_vmsge(n_clip, 0, vlen); - //VFLOAT first = __riscv_vmerge(u, One, n_ge_0, vlen); - //VFLOAT second = __riscv_vmerge(One, u, n_ge_0, vlen); - //VFLOAT A = __riscv_vfadd(u, One, vlen); - //VFLOAT a = __riscv_vfsub(first, A, vlen); - //a = __riscv_vfadd(a, second, vlen); - - // Compute A + a + P_head + P_tail + r_lo + r^3 * poly - P_tail = __riscv_vfadd(P_tail, a, vlen); - P_tail = __riscv_vfadd(P_tail, r_lo, vlen); - poly = __riscv_vfmadd(poly, r_cube, P_tail, vlen); - P_head = __riscv_vfadd(P_head, poly, vlen); - - vy = __riscv_vfadd(P_head, A, vlen); - // vy is now exp(r) - 1 accurately. - - // Need to compute 2^n * exp(r). 
- // Although most of the time, it suffices to add n to the exponent field of exp(r) - // this will fail n is just a bit too positive or negative, corresponding to - // 2^n * exp(r) causing over or underflow. - // So we have to decompose n into n1 + n2 where n1 = n >> 1 - // 2^n1 * exp(r) can be performed by adding n to exp(r)'s exponent field - // But we need to create the floating point value scale = 2^n2 - // and perform a multiplication to finish the task. - - //n1 =__riscv_vsra_vx_i64m1(n, (size_t) 1, vlen); - //n2 = __riscv_vsub(n, n1, vlen); - //n1 = __riscv_vsll_vx_i64m1(n1, (size_t) 52, vlen); - //vy = I_AS_F( __riscv_vadd(F_AS_I(vy), n1, vlen)); - //n2 = __riscv_vadd_vx_i64m1(n2, (INT) 1023, vlen); - //n2 = __riscv_vsll_vx_i64m1(n2, (size_t) 52, vlen); - //vy = __riscv_vfmul(vy, I_AS_F(n2), vlen); - - FAST_LDEXP(vy, n, vlen); - - vy = __riscv_vmerge(vy, vy_special, special_args, vlen); - - // copy vy into y and increment addr pointers - // VSE (y, vy, vlen); - VFSTORE_OUTARG1(vy, vlen); - - INCREMENT_INARG1(vlen); - INCREMENT_OUTARG1(vlen); - - // x += vlen; y += vlen; - } - RESTORE_FRM; +void F_VER1(API) { + size_t vlen; + VFLOAT vx, vy, vy_special; + VBOOL special_args; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + + EXCEPTION_HANDLING_EXPM1(vx, special_args, vy_special, vlen); + + // Clip + vx = FCLIP(vx, X_MIN, X_MAX, vlen); + + // Argument reduction + VFLOAT flt_n = __riscv_vfmul(vx, P_INV_STD, vlen); + VINT n = __riscv_vfcvt_x(flt_n, vlen); + flt_n = __riscv_vfcvt_f(n, vlen); + + VFLOAT r_tmp = __riscv_vfnmsac(vx, P_HI_STD, flt_n, vlen); + VFLOAT r = __riscv_vfnmsub(flt_n, P_LO_STD, r_tmp, vlen); + VFLOAT r_lo = __riscv_vfsub(r_tmp, r, vlen); + r_lo = __riscv_vfnmsac(r_lo, P_LO_STD, flt_n, vlen); + // r is the reduced argument in working precision; but r + r_lo is extra + // precise exp(r+r_lo) is 1 + (r+r_lo) + (r+r_lo)^2/2 + r^3 * polynomial(r) + // 1 + r + r^2/2 + (r_lo + r * r_lo) + r^3 * polynomial(r) + r_lo = __riscv_vfmacc(r_lo, r, r_lo, vlen); + // 1 + r + r^2/2 + r_lo + r^3 * polynomial(r) + + // Compute P_head + P_tail as r + r^2/2 accurately + VFLOAT r_prime = __riscv_vfmul(r, 0x1.0p-1, vlen); // this coeff is 1/2 + VFLOAT P_head = __riscv_vfmadd(r, r_prime, r, vlen); + VFLOAT P_tail = __riscv_vfsub(r, P_head, vlen); + P_tail = __riscv_vfmacc(P_tail, r, r_prime, vlen); + + // Polynomial computation, we have a degree 11 polynomial + VFLOAT poly_right = PSTEP( + 0x1.71ddf7aef0679p-19, r, + PSTEP(0x1.27e4e210af311p-22, r, + PSTEP(0x1.af5ff637cd647p-26, 0x1.1f6562eae5ba9p-29, r, vlen), + vlen), + vlen); + + VFLOAT poly_mid = PSTEP( + 0x1.6c16c16c166f3p-10, r, + PSTEP(0x1.a01a01b0207e3p-13, 0x1.a01a01a4af90ap-16, r, vlen), vlen); + + VFLOAT poly_left = + PSTEP(0x1.5555555555559p-3, r, + PSTEP(0x1.5555555555556p-5, 0x1.111111110f62ap-7, r, vlen), vlen); + + VFLOAT r_sq = __riscv_vfmul(r, r, vlen); + VFLOAT r_cube = __riscv_vfmul(r_sq, r, vlen); + + VFLOAT poly = __riscv_vfmadd(poly_right, r_cube, poly_mid, vlen); + poly = __riscv_vfmadd(poly, r_cube, poly_left, vlen); + // At this point, exp(r) is 1 + P_head + P_tail + r_lo + r^3 * poly + // expm1(x) = 2^n ( 1 - 2^(-n) + P_head + P_tail + r_lo + r^3 * poly ) + + // Compute 1 - 2^(-n) accurately using Knuth-2-sum + // + VFLOAT One = VFMV_VF(fp_posOne, vlen); + VINT n_clip = + __riscv_vmin(n, 64, vlen); // n_clip <= 64; note that n_clip >= -61 + n_clip = __riscv_vrsub(n_clip, -1025, vlen); // (sign,expo) of 
-2^(-n_clip) + VFLOAT u = I_AS_F(__riscv_vsll(n_clip, 52, vlen)); // u = -2^(-n_clip) + VFLOAT A, a; + KNUTH2SUM(One, u, A, a, vlen); + + // VBOOL n_ge_0 = __riscv_vmsge(n_clip, 0, vlen); + // VFLOAT first = __riscv_vmerge(u, One, n_ge_0, vlen); + // VFLOAT second = __riscv_vmerge(One, u, n_ge_0, vlen); + // VFLOAT A = __riscv_vfadd(u, One, vlen); + // VFLOAT a = __riscv_vfsub(first, A, vlen); + // a = __riscv_vfadd(a, second, vlen); + + // Compute A + a + P_head + P_tail + r_lo + r^3 * poly + P_tail = __riscv_vfadd(P_tail, a, vlen); + P_tail = __riscv_vfadd(P_tail, r_lo, vlen); + poly = __riscv_vfmadd(poly, r_cube, P_tail, vlen); + P_head = __riscv_vfadd(P_head, poly, vlen); + + vy = __riscv_vfadd(P_head, A, vlen); + // vy is now exp(r) - 1 accurately. + + // Need to compute 2^n * exp(r). + // Although most of the time, it suffices to add n to the exponent field of + // exp(r) this will fail n is just a bit too positive or negative, + // corresponding to 2^n * exp(r) causing over or underflow. So we have to + // decompose n into n1 + n2 where n1 = n >> 1 2^n1 * exp(r) can be + // performed by adding n to exp(r)'s exponent field But we need to create + // the floating point value scale = 2^n2 and perform a multiplication to + // finish the task. + + // n1 =__riscv_vsra_vx_i64m1(n, (size_t) 1, vlen); + // n2 = __riscv_vsub(n, n1, vlen); + // n1 = __riscv_vsll_vx_i64m1(n1, (size_t) 52, vlen); + // vy = I_AS_F( __riscv_vadd(F_AS_I(vy), n1, vlen)); + // n2 = __riscv_vadd_vx_i64m1(n2, (INT) 1023, vlen); + // n2 = __riscv_vsll_vx_i64m1(n2, (size_t) 52, vlen); + // vy = __riscv_vfmul(vy, I_AS_F(n2), vlen); + + FAST_LDEXP(vy, n, vlen); + + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); + + // copy vy into y and increment addr pointers + // VSE (y, vy, vlen); + VFSTORE_OUTARG1(vy, vlen); + + INCREMENT_INARG1(vlen); + INCREMENT_OUTARG1(vlen); + + // x += vlen; y += vlen; + } + RESTORE_FRM; } - diff --git a/include/rvvlm_fp.inc.h b/include/rvvlm_fp.inc.h index 55b0ecb..e7a0283 100644 --- a/include/rvvlm_fp.inc.h +++ b/include/rvvlm_fp.inc.h @@ -27,22 +27,26 @@ static_assert(false, "Must assign STRIDE before including " __FILE__); _Static_assert(0, "NaN not available on this architecture") #endif -#define __PASTE2_BASE(A,B) A##B -#define __PASTE2(A,B) __PASTE2_BASE(A,B) -#define __PASTE3_BASE(A,B,C) A##B##C -#define __PASTE3(A,B,C) __PASTE3_BASE(A,B,C) -#define __PASTE4_BASE(A,B,C,D) A##B##C##D -#define __PASTE4(A,B,C,D) __PASTE4_BASE(A,B,C,D) -#define __PASTE5_BASE(A,B,C,D,E) A##B##C##D##E -#define __PASTE5(A,B,C,D,E) __PASTE5_BASE(A,B,C,D,E) -#define __PASTE6_BASE(A,B,C,D,E,F) A##B##C##D##E##F -#define __PASTE6(A,B,C,D,E,F) __PASTE5_BASE(A,B,C,D,E,F) - -#define MAKE_VTYPE(A) __PASTE3(A,BIT_WIDTH,__PASTE3(m,LMUL,_t)) -#define MAKE_TYPE(A) __PASTE3(A,BIT_WIDTH,_t) -#define MAKE_FUNC(A) __PASTE3(A,BIT_WIDTH,__PASTE2(m,LMUL)) -#define MAKE_VLOAD(A) __PASTE3(__PASTE3(__riscv_vle,BIT_WIDTH,_v_),A,__PASTE3(BIT_WIDTH,m,LMUL)) -#define MAKE_VSLOAD(A) __PASTE3(__PASTE3(__riscv_vlse,BIT_WIDTH,_v_),A,__PASTE3(BIT_WIDTH,m,LMUL)) +#define __PASTE2_BASE(A, B) A##B +#define __PASTE2(A, B) __PASTE2_BASE(A, B) +#define __PASTE3_BASE(A, B, C) A##B##C +#define __PASTE3(A, B, C) __PASTE3_BASE(A, B, C) +#define __PASTE4_BASE(A, B, C, D) A##B##C##D +#define __PASTE4(A, B, C, D) __PASTE4_BASE(A, B, C, D) +#define __PASTE5_BASE(A, B, C, D, E) A##B##C##D##E +#define __PASTE5(A, B, C, D, E) __PASTE5_BASE(A, B, C, D, E) +#define __PASTE6_BASE(A, B, C, D, E, F) A##B##C##D##E##F +#define __PASTE6(A, B, C, D, E, F) 
__PASTE6_BASE(A, B, C, D, E, F) + +#define MAKE_VTYPE(A) __PASTE3(A, BIT_WIDTH, __PASTE3(m, LMUL, _t)) +#define MAKE_TYPE(A) __PASTE3(A, BIT_WIDTH, _t) +#define MAKE_FUNC(A) __PASTE3(A, BIT_WIDTH, __PASTE2(m, LMUL)) +#define MAKE_VLOAD(A) \ + __PASTE3(__PASTE3(__riscv_vle, BIT_WIDTH, _v_), A, \ + __PASTE3(BIT_WIDTH, m, LMUL)) +#define MAKE_VSLOAD(A) \ + __PASTE3(__PASTE3(__riscv_vlse, BIT_WIDTH, _v_), A, \ + __PASTE3(BIT_WIDTH, m, LMUL)) #if (BIT_WIDTH == 64) #define NATIVE_TYPE double @@ -51,19 +55,19 @@ _Static_assert(0, "NaN not available on this architecture") static_assert(false, "requested BIT_WIDTH unsupported" __FILE__); #endif -#define API_11_US size_t _inarg_n, const NATIVE_TYPE * _inarg1, NATIVE_TYPE * _outarg1 -#define API_11_GS size_t _inarg_n, const NATIVE_TYPE * _inarg1, size_t _inarg1_stride, \ - NATIVE_TYPE * _outarg1, size_t _outarg1_stride -#define API_21_US size_t _inarg_n, \ - const NATIVE_TYPE * _inarg1, const NATIVE_TYPE * _inarg2, \ - NATIVE_TYPE * _outarg1 - -#define API_21_GS size_t _inarg_n, \ - const NATIVE_TYPE * _inarg1, size_t _inarg1_stride, \ - const NATIVE_TYPE * _inarg2, size_t _inarg2_stride, \ - NATIVE_TYPE * _outarg1, size_t _outarg1_stride - +#define API_11_US \ + size_t _inarg_n, const NATIVE_TYPE *_inarg1, NATIVE_TYPE *_outarg1 +#define API_11_GS \ + size_t _inarg_n, const NATIVE_TYPE *_inarg1, size_t _inarg1_stride, \ + NATIVE_TYPE *_outarg1, size_t _outarg1_stride +#define API_21_US \ + size_t _inarg_n, const NATIVE_TYPE *_inarg1, const NATIVE_TYPE *_inarg2, \ + NATIVE_TYPE *_outarg1 +#define API_21_GS \ + size_t _inarg_n, const NATIVE_TYPE *_inarg1, size_t _inarg1_stride, \ + const NATIVE_TYPE *_inarg2, size_t _inarg2_stride, \ + NATIVE_TYPE *_outarg1, size_t _outarg1_stride #if (API_SIGNATURE == API_SIGNATURE_11) #if (STRIDE == UNIT_STRIDE) @@ -80,39 +84,73 @@ static_assert(false, "requested BIT_WIDTH unsupported" __FILE__); #endif #if (STRIDE == UNIT_STRIDE) -#define VFLOAD_INARG1(vlen) MAKE_VLOAD(f) (_inarg1, (vlen)) -#define VFLOAD_INARG2(vlen) MAKE_VLOAD(f) (_inarg2, (vlen)) -#define VFSTORE_OUTARG1(vy,vlen) __PASTE2(__riscv_vse,BIT_WIDTH)(_outarg1,(vy),(vlen)) -#define VFSTORE_OUTARG2(vy,vlen) __PASTE2(__riscv_vse,BIT_WIDTH)(_outarg2,(vy),(vlen)) -#define INCREMENT_INARG1(vlen) do {_inarg1 += (vlen);} while(0) -#define INCREMENT_INARG2(vlen) do {_inarg2 += (vlen);} while(0) -#define INCREMENT_OUTARG1(vlen) do {_outarg1 += (vlen);} while(0) -#define INCREMENT_OUTARG2(vlen) do {_outarg2 += (vlen);} while(0) +#define VFLOAD_INARG1(vlen) MAKE_VLOAD(f)(_inarg1, (vlen)) +#define VFLOAD_INARG2(vlen) MAKE_VLOAD(f)(_inarg2, (vlen)) +#define VFSTORE_OUTARG1(vy, vlen) \ + __PASTE2(__riscv_vse, BIT_WIDTH)(_outarg1, (vy), (vlen)) +#define VFSTORE_OUTARG2(vy, vlen) \ + __PASTE2(__riscv_vse, BIT_WIDTH)(_outarg2, (vy), (vlen)) +#define INCREMENT_INARG1(vlen) \ + do { \ + _inarg1 += (vlen); \ + } while (0) +#define INCREMENT_INARG2(vlen) \ + do { \ + _inarg2 += (vlen); \ + } while (0) +#define INCREMENT_OUTARG1(vlen) \ + do { \ + _outarg1 += (vlen); \ + } while (0) +#define INCREMENT_OUTARG2(vlen) \ + do { \ + _outarg2 += (vlen); \ + } while (0) #else -#define VFLOAD_INARG1(vlen) MAKE_VSLOAD(f)(_inarg1, _inarg1_stride*TYPE_SIZE, (vlen)) -#define VFLOAD_INARG2(vlen) MAKE_VSLOAD(f)(_inarg2, _inarg2_stride*TYPE_SIZE, (vlen)) -#define VFSTORE_OUTARG1(vy,vlen) __PASTE2(__riscv_vsse,BIT_WIDTH)(_outarg1,_outarg1_stride*TYPE_SIZE,(vy),(vlen)) -#define VFSTORE_OUTARG2(vy,vlen) __PASTE2(__riscv_vsse,BIT_WIDTH)(_outarg2,_outarg2_stride*TYPE_SIZE,(vy),(vlen)) -#define 
INCREMENT_INARG1(vlen) do {_inarg1 += _inarg1_stride*(vlen);} while(0) -#define INCREMENT_INARG2(vlen) do {_inarg2 += _inarg2_stride*(vlen);} while(0) -#define INCREMENT_OUTARG1(vlen) do {_outarg1 += _outarg1_stride*(vlen);} while(0) -#define INCREMENT_OUTARG2(vlen) do {_outarg2 += _outarg2_stride*(vlen);} while(0) +#define VFLOAD_INARG1(vlen) \ + MAKE_VSLOAD(f)(_inarg1, _inarg1_stride * TYPE_SIZE, (vlen)) +#define VFLOAD_INARG2(vlen) \ + MAKE_VSLOAD(f)(_inarg2, _inarg2_stride * TYPE_SIZE, (vlen)) +#define VFSTORE_OUTARG1(vy, vlen) \ + __PASTE2(__riscv_vsse, BIT_WIDTH) \ + (_outarg1, _outarg1_stride * TYPE_SIZE, (vy), (vlen)) +#define VFSTORE_OUTARG2(vy, vlen) \ + __PASTE2(__riscv_vsse, BIT_WIDTH) \ + (_outarg2, _outarg2_stride * TYPE_SIZE, (vy), (vlen)) +#define INCREMENT_INARG1(vlen) \ + do { \ + _inarg1 += _inarg1_stride * (vlen); \ + } while (0) +#define INCREMENT_INARG2(vlen) \ + do { \ + _inarg2 += _inarg2_stride * (vlen); \ + } while (0) +#define INCREMENT_OUTARG1(vlen) \ + do { \ + _outarg1 += _outarg1_stride * (vlen); \ + } while (0) +#define INCREMENT_OUTARG2(vlen) \ + do { \ + _outarg2 += _outarg2_stride * (vlen); \ + } while (0) #endif // For MAKE_VBOOL, the value is 64/LMUL #if (LMUL == 1) -#define MAKE_VBOOL(A) __PASTE3(A,64,_t) +#define MAKE_VBOOL(A) __PASTE3(A, 64, _t) #elif (LMUL == 2) -#define MAKE_VBOOL(A) __PASTE3(A,32,_t) +#define MAKE_VBOOL(A) __PASTE3(A, 32, _t) #elif (LMUL == 4) -#define MAKE_VBOOL(A) __PASTE3(A,16,_t) +#define MAKE_VBOOL(A) __PASTE3(A, 16, _t) #elif (LMUL == 8) -#define MAKE_VBOOL(A) __PASTE3(A,8,_t) +#define MAKE_VBOOL(A) __PASTE3(A, 8, _t) #endif -#define VSET __PASTE2(__riscv_vsetvl_e,__PASTE3(BIT_WIDTH,m,LMUL)) -#define VSE __PASTE2(__riscv_vse,BIT_WIDTH) -#define VSSE __PASTE2(__riscv_vsse,BIT_WIDTH) -#define MAKE_REINTERPRET(A,B) __PASTE5(__riscv_vreinterpret_v_,A,__PASTE4(BIT_WIDTH,m,LMUL,_),B,__PASTE3(BIT_WIDTH,m,LMUL)) +#define VSET __PASTE2(__riscv_vsetvl_e, __PASTE3(BIT_WIDTH, m, LMUL)) +#define VSE __PASTE2(__riscv_vse, BIT_WIDTH) +#define VSSE __PASTE2(__riscv_vsse, BIT_WIDTH) +#define MAKE_REINTERPRET(A, B) \ + __PASTE5(__riscv_vreinterpret_v_, A, __PASTE4(BIT_WIDTH, m, LMUL, _), B, \ + __PASTE3(BIT_WIDTH, m, LMUL)) #define FLOAT MAKE_TYPE(float) #define VFLOAT MAKE_VTYPE(vfloat) @@ -122,12 +160,12 @@ static_assert(false, "requested BIT_WIDTH unsupported" __FILE__); #define VUINT MAKE_VTYPE(vuint) #define VBOOL MAKE_VBOOL(vbool) -#define F_AS_I MAKE_REINTERPRET(f,i) -#define F_AS_U MAKE_REINTERPRET(f,u) -#define I_AS_F MAKE_REINTERPRET(i,f) -#define U_AS_F MAKE_REINTERPRET(u,f) -#define I_AS_U MAKE_REINTERPRET(i,u) -#define U_AS_I MAKE_REINTERPRET(u,i) +#define F_AS_I MAKE_REINTERPRET(f, i) +#define F_AS_U MAKE_REINTERPRET(f, u) +#define I_AS_F MAKE_REINTERPRET(i, f) +#define U_AS_F MAKE_REINTERPRET(u, f) +#define I_AS_U MAKE_REINTERPRET(i, u) +#define U_AS_I MAKE_REINTERPRET(u, i) #define VFLOAD MAKE_VLOAD(f) #define VILOAD MAKE_VLOAD(i) @@ -137,10 +175,10 @@ static_assert(false, "requested BIT_WIDTH unsupported" __FILE__); #define VMVU_VX MAKE_FUNC(__riscv_vmv_v_x_u) #define VFMV_VF MAKE_FUNC(__riscv_vfmv_v_f_f) -const INT int_Zero = 0; + const INT int_Zero = 0; const UINT uint_Zero = 0; -#if (BIT_WIDTH == 64) +#if (BIT_WIDTH == 64) #define EXP_BIAS 1023 #define MAN_LEN 52 const uint64_t class_sNaN = 0x100; @@ -184,4 +222,3 @@ static const double fp_negHalf = -0x1.0p-1; #undef _GNU_SOURCE #undef _NEED_UNDEF_GNU_SOURCE #endif - diff --git a/include/rvvlm_fp64m1.h b/include/rvvlm_fp64m1.h index b71e87d..1841217 100644 --- 
a/include/rvvlm_fp64m1.h +++ b/include/rvvlm_fp64m1.h @@ -4,7 +4,7 @@ #ifndef __RVVLM_FP64M1_H__ #define __RVVLM_FP64M1_H__ -#define LMUL 1 +#define LMUL 1 #define BIT_WIDTH 64 #include "rvvlm_fp.inc.h" #endif diff --git a/include/rvvlm_fp64m2.h b/include/rvvlm_fp64m2.h index 1b12108..c21e624 100644 --- a/include/rvvlm_fp64m2.h +++ b/include/rvvlm_fp64m2.h @@ -4,7 +4,7 @@ #ifndef __RVVLM_FP64M2_H__ #define __RVVLM_FP64M2_H__ -#define LMUL 2 +#define LMUL 2 #define BIT_WIDTH 64 #include "rvvlm_fp.inc.h" #endif diff --git a/include/rvvlm_fp64m4.h b/include/rvvlm_fp64m4.h index 0b92347..d3930c4 100644 --- a/include/rvvlm_fp64m4.h +++ b/include/rvvlm_fp64m4.h @@ -4,7 +4,7 @@ #ifndef __RVVLM_FP64M4_H__ #define __RVVLM_FP64M4_H__ -#define LMUL 4 +#define LMUL 4 #define BIT_WIDTH 64 #include "rvvlm_fp.inc.h" #endif diff --git a/include/rvvlm_log1pD.inc.h b/include/rvvlm_log1pD.inc.h index 57d0ef7..6ea2ef5 100644 --- a/include/rvvlm_log1pD.inc.h +++ b/include/rvvlm_log1pD.inc.h @@ -3,143 +3,149 @@ // SPDX-License-Identifier: Apache-2.0 // Macros for common small-block codes -#define EXCEPTION_HANDLING_LOG1P(vx, vxp1, special_args, vy_special, vlen) \ -do { \ - vxp1 = __riscv_vfadd(vx, fp_posOne, vlen); \ - VUINT vclass = __riscv_vfclass((vxp1), (vlen)); \ - /* special handling except positive normal number */ \ - IDENTIFY(vclass, 0x3BF, (special_args), (vlen)); \ - UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ - if (nb_special_args > 0) { \ - VBOOL id_mask; \ - /* substitute negative arguments with sNaN */ \ - IDENTIFY(vclass, class_negative, id_mask, (vlen)); \ - vxp1 = __riscv_vfmerge(vxp1, fp_sNaN, id_mask, vlen); \ - /* substitute 0 with -0 */ \ - IDENTIFY(vclass, class_Zero, id_mask, vlen); \ - vxp1 = __riscv_vfmerge(vxp1, fp_negZero, id_mask, vlen); \ - /* eliminate positive denorm input from special_args */ \ - IDENTIFY(vclass, 0x39F, (special_args), (vlen)); \ - /* for narrowed set of special arguments, compute vx+vfrec7(vx) */ \ - vy_special = __riscv_vfrec7((special_args), (vxp1), (vlen)); \ - vy_special = __riscv_vfadd((special_args), vy_special, (vxp1), (vlen)); \ - vx = __riscv_vfmerge((vx), fp_posOne, (special_args), (vlen)); \ - vxp1 = __riscv_vfmerge((vxp1), 0x1.0p1, (special_args), (vlen)); \ - } \ -} while(0) +#define EXCEPTION_HANDLING_LOG1P(vx, vxp1, special_args, vy_special, vlen) \ + do { \ + vxp1 = __riscv_vfadd(vx, fp_posOne, vlen); \ + VUINT vclass = __riscv_vfclass((vxp1), (vlen)); \ + /* special handling except positive normal number */ \ + IDENTIFY(vclass, 0x3BF, (special_args), (vlen)); \ + UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ + if (nb_special_args > 0) { \ + VBOOL id_mask; \ + /* substitute negative arguments with sNaN */ \ + IDENTIFY(vclass, class_negative, id_mask, (vlen)); \ + vxp1 = __riscv_vfmerge(vxp1, fp_sNaN, id_mask, vlen); \ + /* substitute 0 with -0 */ \ + IDENTIFY(vclass, class_Zero, id_mask, vlen); \ + vxp1 = __riscv_vfmerge(vxp1, fp_negZero, id_mask, vlen); \ + /* eliminate positive denorm input from special_args */ \ + IDENTIFY(vclass, 0x39F, (special_args), (vlen)); \ + /* for narrowed set of special arguments, compute vx+vfrec7(vx) */ \ + vy_special = __riscv_vfrec7((special_args), (vxp1), (vlen)); \ + vy_special = __riscv_vfadd((special_args), vy_special, (vxp1), (vlen)); \ + vx = __riscv_vfmerge((vx), fp_posOne, (special_args), (vlen)); \ + vxp1 = __riscv_vfmerge((vxp1), 0x1.0p1, (special_args), (vlen)); \ + } \ + } while (0) #if (STRIDE == UNIT_STRIDE) -#define F_VER1 RVVLM_LOG1PD_TBL128 +#define F_VER1 
RVVLM_LOG1PD_TBL128 #else -#define F_VER1 RVVLM_LOG1PDI_TBL128 +#define F_VER1 RVVLM_LOG1PDI_TBL128 #endif #define LOG2_HI 0x1.62e42fefa39efp-1 -#define LOG2_LO 0x1.abc9e3b39803fp-56 +#define LOG2_LO 0x1.abc9e3b39803fp-56 #include // Version 1 uses a 128-entry LUT void F_VER1(API) { - size_t vlen; - VFLOAT vx, vxp1, vy, vy_special; - VBOOL special_args; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - - // Handle cases when 1+x is NaN, Inf, 0 and -ve - EXCEPTION_HANDLING_LOG1P(vx, vxp1, special_args, vy_special, vlen); - - // 1+in_arg at this point are positive, finite and not subnormal - // Decompose in_arg into n, B, r: 1+in_arg = 2^n (1/B) (1 + r) - // B is equivalently defined by ind, 0 <= ind < 128 - - // First compute 1+in_arg as vxp1 + u - VFLOAT first = __riscv_vfmax(vx, fp_posOne, vlen); - VFLOAT second = __riscv_vfmin(vx, fp_posOne, vlen); - VFLOAT u = __riscv_vfsub(first, vxp1, vlen); - u = __riscv_vfadd(u, second, vlen); - - // Decompose 1+in_arg as 2^n (1/B) (1 + r). Unlike log, r has more precision then FP64 - // and needs to be preserved to some extent - // To prevent loss of precision in frec7 when 1+in_arg is too large, we modify vxp1 - // However, unlike in the case of log where we simply change the exponent to 0, - // this technique does not work too well here as we would than need to adjust the exponent of u - // accordingly as well. Here we modify vxp1 only when it is large, which in this case - // u will so small in comparison that missing this theoretically needed adjustment does not matter. - VINT n_adjust = VMVI_VX(0,vlen); - VFLOAT scale = VFMV_VF(fp_posOne, vlen); - VBOOL adjust = __riscv_vmfge(vxp1, 0x1.0p1020, vlen); - n_adjust = __riscv_vmerge(n_adjust, 4, adjust, vlen); - scale = __riscv_vfmerge(scale, 0x1.0p-4, adjust, vlen); - vxp1 = __riscv_vfmul(vxp1, scale, vlen); - - VINT n = U_AS_I(__riscv_vadd(__riscv_vsrl(F_AS_U(vxp1), MAN_LEN-1, vlen), 1, vlen)); - n = __riscv_vsra(n, 1, vlen); - n = __riscv_vsub(n, EXP_BIAS, vlen); - n = __riscv_vadd(n, n_adjust, vlen); - VFLOAT n_flt = __riscv_vfcvt_f(n, vlen); - - VUINT ind = __riscv_vand(__riscv_vsrl(F_AS_U(vxp1), MAN_LEN-10, vlen), 0x3F8, vlen); - - // normalize exponent of vx - VFLOAT B = __riscv_vfrec7(vxp1, vlen); - // adjust B if ind == 0. We cannot simply set B to be 1.0, but round up at the 7-th bit - VBOOL adjust_B = __riscv_vmseq(ind, 0, vlen); - B = I_AS_F(__riscv_vadd_mu(adjust_B, F_AS_I(B), F_AS_I(B), 0x0000200000000000, vlen)); - - VFLOAT r_hi = VFMV_VF(fp_posOne, vlen); - r_hi = __riscv_vfmsac(r_hi, vxp1, B, vlen); - VFLOAT r = __riscv_vfmacc(r_hi, u, B, vlen); - VFLOAT delta_r = __riscv_vfsub(r_hi, r, vlen); - delta_r = __riscv_vfmacc(delta_r, u, B, vlen); - - // 1+in_arg = 2^n (1/B) (1 + r + delta_r) - // Thus log(1+in_arg) = (n + T) * log(2) + log(1 + r + delta_r) - // log is natural log (aka ln) - // log(1 + r + delta_r) is approximated by r + delta_r + r^2(p0 + p1 r + ... 
) - VFLOAT rsq = __riscv_vfmul(r, r, vlen); - VFLOAT rcube = __riscv_vfmul(rsq, r, vlen); - - VFLOAT poly_right = PSTEP( 0x1.9999998877038p-3, r, - PSTEP( -0x1.555c54f8b7c6cp-3, 0x1.2499765b3c27ap-3, r, - vlen), vlen); - - VFLOAT poly_left = PSTEP( -0x1.000000000001cp-1, r, - PSTEP( 0x1.55555555555a9p-2, -0x1.fffffff2018cfp-3, r, - vlen), vlen); - - VFLOAT poly = __riscv_vfmadd(poly_right, rcube, poly_left, vlen); - poly = __riscv_vfmadd(rsq, poly, delta_r, vlen); - // At this point, log(1+r+delta_r) is approximated by r + poly - - // Load table values and get n_flt + T to be A + a - VINT T = __riscv_vluxei64(logD_tbl128_fixedpt, ind, vlen); - VINT T_hi = __riscv_vsll(__riscv_vsra(T, 24, vlen), 24, vlen); - VINT T_lo = __riscv_vsub(T, T_hi, vlen); - VFLOAT T_hi_flt = __riscv_vfcvt_f(T_hi, vlen); - VFLOAT A = __riscv_vfmadd(T_hi_flt, 0x1.0p-63, n_flt, vlen); - VFLOAT a = __riscv_vfcvt_f(T_lo, vlen); - a = __riscv_vfmul(a, 0x1.0p-63, vlen); - - // Compute (A + a) * (log2_hi + log2_lo) + (r + poly) - // where B can be e, 2, or 10 - VFLOAT delta_1 = __riscv_vfmul(A, LOG2_LO, vlen); - delta_1 = __riscv_vfmadd(a, LOG2_HI, delta_1, vlen); - poly = __riscv_vfadd(poly, delta_1, vlen); - poly = __riscv_vfadd(poly, r, vlen); - vy = __riscv_vfmadd(A, LOG2_HI, poly, vlen); - - vy = __riscv_vmerge(vy, vy_special, special_args, vlen); - - // copy vy into y and increment addr pointers - VFSTORE_OUTARG1(vy, vlen); - - INCREMENT_INARG1(vlen); - INCREMENT_OUTARG1(vlen); - } - RESTORE_FRM; + size_t vlen; + VFLOAT vx, vxp1, vy, vy_special; + VBOOL special_args; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + + // Handle cases when 1+x is NaN, Inf, 0 and -ve + EXCEPTION_HANDLING_LOG1P(vx, vxp1, special_args, vy_special, vlen); + + // 1+in_arg at this point is positive, finite and not subnormal + // Decompose in_arg into n, B, r: 1+in_arg = 2^n (1/B) (1 + r) + // B is equivalently defined by ind, 0 <= ind < 128 + + // First compute 1+in_arg as vxp1 + u + VFLOAT first = __riscv_vfmax(vx, fp_posOne, vlen); + VFLOAT second = __riscv_vfmin(vx, fp_posOne, vlen); + VFLOAT u = __riscv_vfsub(first, vxp1, vlen); + u = __riscv_vfadd(u, second, vlen); + + // Decompose 1+in_arg as 2^n (1/B) (1 + r). Unlike log, r has more precision + // than FP64 and needs to be preserved to some extent. To prevent loss of + // precision in frec7 when 1+in_arg is too large, we modify vxp1. However, + // unlike in the case of log where we simply change the exponent to 0, this + // technique does not work too well here, as we would then need to adjust the + // exponent of u accordingly as well. Here we modify vxp1 only when it is + // large, in which case u will be so small in comparison that missing this + // theoretically needed adjustment does not matter. 
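The max/min ordering above is a Fast2Sum: because the operand of larger magnitude enters the subtraction first, u recovers the rounding error of vxp1 = 1+x exactly under round-to-nearest; ordering by value suffices here since any negative x in log1p's domain satisfies -1 < x < 0. A scalar sketch of this step together with the guarded scale-down just described (names are illustrative, not the library's):

#include <math.h>

/* Sketch: represent 1.0 + x exactly as hi + lo, then scale hi down
 * when it is huge so the frec7-based reciprocal stays well in range. */
static void log1p_prepare(double x, double *hi, double *lo, int *n_adjust) {
  double first = fmax(x, 1.0);  /* larger-magnitude operand when x > -1 */
  double second = fmin(x, 1.0);
  double s = first + second;    /* the working-precision 1+x (vxp1) */
  *lo = (first - s) + second;   /* Fast2Sum error term (u); exact in RN */
  *n_adjust = 0;
  if (s >= 0x1.0p1020) {        /* mirrors the vmfge/vfmerge guard below */
    s *= 0x1.0p-4;              /* scale by 2^-4, recorded in n_adjust */
    *n_adjust = 4;              /* lo is left alone: negligible at this size */
  }
  *hi = s;
}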
+ VINT n_adjust = VMVI_VX(0, vlen); + VFLOAT scale = VFMV_VF(fp_posOne, vlen); + VBOOL adjust = __riscv_vmfge(vxp1, 0x1.0p1020, vlen); + n_adjust = __riscv_vmerge(n_adjust, 4, adjust, vlen); + scale = __riscv_vfmerge(scale, 0x1.0p-4, adjust, vlen); + vxp1 = __riscv_vfmul(vxp1, scale, vlen); + + VINT n = U_AS_I( + __riscv_vadd(__riscv_vsrl(F_AS_U(vxp1), MAN_LEN - 1, vlen), 1, vlen)); + n = __riscv_vsra(n, 1, vlen); + n = __riscv_vsub(n, EXP_BIAS, vlen); + n = __riscv_vadd(n, n_adjust, vlen); + VFLOAT n_flt = __riscv_vfcvt_f(n, vlen); + + VUINT ind = __riscv_vand(__riscv_vsrl(F_AS_U(vxp1), MAN_LEN - 10, vlen), + 0x3F8, vlen); + + // normalize exponent of vx + VFLOAT B = __riscv_vfrec7(vxp1, vlen); + // adjust B if ind == 0. We cannot simply set B to be 1.0, but round up at + // the 7-th bit + VBOOL adjust_B = __riscv_vmseq(ind, 0, vlen); + B = I_AS_F(__riscv_vadd_mu(adjust_B, F_AS_I(B), F_AS_I(B), + 0x0000200000000000, vlen)); + + VFLOAT r_hi = VFMV_VF(fp_posOne, vlen); + r_hi = __riscv_vfmsac(r_hi, vxp1, B, vlen); + VFLOAT r = __riscv_vfmacc(r_hi, u, B, vlen); + VFLOAT delta_r = __riscv_vfsub(r_hi, r, vlen); + delta_r = __riscv_vfmacc(delta_r, u, B, vlen); + + // 1+in_arg = 2^n (1/B) (1 + r + delta_r) + // Thus log(1+in_arg) = (n + T) * log(2) + log(1 + r + delta_r) + // log is natural log (aka ln) + // log(1 + r + delta_r) is approximated by r + delta_r + r^2(p0 + p1 r + ... + // ) + VFLOAT rsq = __riscv_vfmul(r, r, vlen); + VFLOAT rcube = __riscv_vfmul(rsq, r, vlen); + + VFLOAT poly_right = PSTEP( + 0x1.9999998877038p-3, r, + PSTEP(-0x1.555c54f8b7c6cp-3, 0x1.2499765b3c27ap-3, r, vlen), vlen); + + VFLOAT poly_left = PSTEP( + -0x1.000000000001cp-1, r, + PSTEP(0x1.55555555555a9p-2, -0x1.fffffff2018cfp-3, r, vlen), vlen); + + VFLOAT poly = __riscv_vfmadd(poly_right, rcube, poly_left, vlen); + poly = __riscv_vfmadd(rsq, poly, delta_r, vlen); + // At this point, log(1+r+delta_r) is approximated by r + poly + + // Load table values and get n_flt + T to be A + a + VINT T = __riscv_vluxei64(logD_tbl128_fixedpt, ind, vlen); + VINT T_hi = __riscv_vsll(__riscv_vsra(T, 24, vlen), 24, vlen); + VINT T_lo = __riscv_vsub(T, T_hi, vlen); + VFLOAT T_hi_flt = __riscv_vfcvt_f(T_hi, vlen); + VFLOAT A = __riscv_vfmadd(T_hi_flt, 0x1.0p-63, n_flt, vlen); + VFLOAT a = __riscv_vfcvt_f(T_lo, vlen); + a = __riscv_vfmul(a, 0x1.0p-63, vlen); + + // Compute (A + a) * (log2_hi + log2_lo) + (r + poly) + // where B can be e, 2, or 10 + VFLOAT delta_1 = __riscv_vfmul(A, LOG2_LO, vlen); + delta_1 = __riscv_vfmadd(a, LOG2_HI, delta_1, vlen); + poly = __riscv_vfadd(poly, delta_1, vlen); + poly = __riscv_vfadd(poly, r, vlen); + vy = __riscv_vfmadd(A, LOG2_HI, poly, vlen); + + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); + + // copy vy into y and increment addr pointers + VFSTORE_OUTARG1(vy, vlen); + + INCREMENT_INARG1(vlen); + INCREMENT_OUTARG1(vlen); + } + RESTORE_FRM; } diff --git a/include/rvvlm_logD.inc.h b/include/rvvlm_logD.inc.h index f973944..9da9e4f 100644 --- a/include/rvvlm_logD.inc.h +++ b/include/rvvlm_logD.inc.h @@ -3,71 +3,71 @@ // SPDX-License-Identifier: Apache-2.0 // Macros for common small-block codes -#define EXCEPTION_HANDLING_LOG(vx, special_args, vy_special, n_adjust, vlen) \ -do { \ - VUINT vclass = __riscv_vfclass((vx), (vlen)); \ - /* special handling except positive normal number */ \ - IDENTIFY(vclass, 0x3BF, (special_args), (vlen)); \ - UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ - n_adjust = VMVI_VX(0, (vlen)); \ - if (nb_special_args > 0) { \ - VBOOL id_mask; \ - /* 
substitute negative arguments with sNaN */ \ - IDENTIFY(vclass, class_negative, id_mask, (vlen)); \ - vx = __riscv_vfmerge(vx, fp_sNaN, id_mask, vlen); \ - /* substitute +0 argument with -0 */ \ - IDENTIFY(vclass, class_posZero, id_mask, vlen); \ - vx = __riscv_vfmerge(vx, fp_negZero, id_mask, vlen); \ - /* eliminate positive denorm input from special_args */ \ - IDENTIFY(vclass, 0x39F, (special_args), (vlen)); \ - /* for narrowed set of special arguments, compute vx+vfrec7(vx) */ \ - vy_special = __riscv_vfrec7((special_args), (vx), (vlen)); \ - vy_special = __riscv_vfadd((special_args), vy_special, (vx), (vlen)); \ - vx = __riscv_vfmerge((vx), fp_posOne, (special_args), (vlen)); \ - /* scale up input for positive denormals */ \ - IDENTIFY(vclass, class_posDenorm, id_mask, (vlen)); \ - n_adjust = __riscv_vmerge(n_adjust, 64, id_mask, vlen); \ - VFLOAT vx_normalized = __riscv_vfmul(id_mask, vx, 0x1.0p64, vlen); \ - vx = __riscv_vmerge(vx, vx_normalized, id_mask, vlen); \ - } \ -} while(0) +#define EXCEPTION_HANDLING_LOG(vx, special_args, vy_special, n_adjust, vlen) \ + do { \ + VUINT vclass = __riscv_vfclass((vx), (vlen)); \ + /* special handling except positive normal number */ \ + IDENTIFY(vclass, 0x3BF, (special_args), (vlen)); \ + UINT nb_special_args = __riscv_vcpop((special_args), (vlen)); \ + n_adjust = VMVI_VX(0, (vlen)); \ + if (nb_special_args > 0) { \ + VBOOL id_mask; \ + /* substitute negative arguments with sNaN */ \ + IDENTIFY(vclass, class_negative, id_mask, (vlen)); \ + vx = __riscv_vfmerge(vx, fp_sNaN, id_mask, vlen); \ + /* substitute +0 argument with -0 */ \ + IDENTIFY(vclass, class_posZero, id_mask, vlen); \ + vx = __riscv_vfmerge(vx, fp_negZero, id_mask, vlen); \ + /* eliminate positive denorm input from special_args */ \ + IDENTIFY(vclass, 0x39F, (special_args), (vlen)); \ + /* for narrowed set of special arguments, compute vx+vfrec7(vx) */ \ + vy_special = __riscv_vfrec7((special_args), (vx), (vlen)); \ + vy_special = __riscv_vfadd((special_args), vy_special, (vx), (vlen)); \ + vx = __riscv_vfmerge((vx), fp_posOne, (special_args), (vlen)); \ + /* scale up input for positive denormals */ \ + IDENTIFY(vclass, class_posDenorm, id_mask, (vlen)); \ + n_adjust = __riscv_vmerge(n_adjust, 64, id_mask, vlen); \ + VFLOAT vx_normalized = __riscv_vfmul(id_mask, vx, 0x1.0p64, vlen); \ + vx = __riscv_vmerge(vx, vx_normalized, id_mask, vlen); \ + } \ + } while (0) #if defined(COMPILE_FOR_LOG) #if (STRIDE == UNIT_STRIDE) -#define F_VER1 RVVLM_LOGD_TBL128 -#define F_VER2 RVVLM_LOGD_ATANH +#define F_VER1 RVVLM_LOGD_TBL128 +#define F_VER2 RVVLM_LOGD_ATANH #else -#define F_VER1 RVVLM_LOGDI_TBL128 -#define F_VER2 RVVLM_LOGDI_ATANH +#define F_VER1 RVVLM_LOGDI_TBL128 +#define F_VER2 RVVLM_LOGDI_ATANH #endif #define LOGB_2_HI 0x1.62e42fefa39efp-1 -#define LOGB_2_LO 0x1.abc9e3b39803fp-56 +#define LOGB_2_LO 0x1.abc9e3b39803fp-56 #define LOGB_e_HI 0x1.0p0 #define LOGB_e_LO 0.0 #elif defined(COMPILE_FOR_LOG2) #if (STRIDE == UNIT_STRIDE) -#define F_VER1 RVVLM_LOG2D_TBL128 -#define F_VER2 RVVLM_LOG2D_ATANH +#define F_VER1 RVVLM_LOG2D_TBL128 +#define F_VER2 RVVLM_LOG2D_ATANH #else -#define F_VER1 RVVLM_LOG2DI_TBL128 -#define F_VER2 RVVLM_LOG2DI_ATANH +#define F_VER1 RVVLM_LOG2DI_TBL128 +#define F_VER2 RVVLM_LOG2DI_ATANH #endif #define LOGB_2_HI 0x1.0p0 #define LOGB_2_LO 0.0 -#define LOGB_e_HI 0x1.71547652b82fep+0 -#define LOGB_e_LO 0x1.777d0ffda0d24p-56 +#define LOGB_e_HI 0x1.71547652b82fep+0 +#define LOGB_e_LO 0x1.777d0ffda0d24p-56 #elif defined(COMPILE_FOR_LOG10) #if (STRIDE == 1) -#define 
F_VER1 RVVLM_LOG10D_TBL128 -#define F_VER2 RVVLM_LOG10D_ATANH +#define F_VER1 RVVLM_LOG10D_TBL128 +#define F_VER2 RVVLM_LOG10D_ATANH #else -#define F_VER1 RVVLM_LOG10DI_TBL128 -#define F_VER2 RVVLM_LOG10DI_ATANH +#define F_VER1 RVVLM_LOG10DI_TBL128 +#define F_VER2 RVVLM_LOG10DI_ATANH #endif -#define LOGB_2_HI 0x1.34413509f79ffp-2 -#define LOGB_2_LO -0x1.9dc1da994fd00p-59 -#define LOGB_e_HI 0x1.bcb7b1526e50ep-2 -#define LOGB_e_LO 0x1.95355baaafad3p-57 +#define LOGB_2_HI 0x1.34413509f79ffp-2 +#define LOGB_2_LO -0x1.9dc1da994fd00p-59 +#define LOGB_e_HI 0x1.bcb7b1526e50ep-2 +#define LOGB_e_LO 0x1.95355baaafad3p-57 #else static_assert(false, "Must specify base of logarithm" __FILE__); #endif @@ -76,207 +76,213 @@ static_assert(false, "Must specify base of logarithm" __FILE__); // Version 1 uses a 128-entry LUT void F_VER1(API) { - size_t vlen; - VFLOAT vx, vy, vy_special; - VBOOL special_args; - VINT n_adjust; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - - // NaN, Inf, and -ve handling, as well as scaling denormal input by 2^64 - EXCEPTION_HANDLING_LOG(vx, special_args, vy_special, n_adjust, vlen); - - // in_arg at this point are positive, finite and not subnormal - // Decompose in_arg into n, B, r: in_arg = 2^n (1/B) (1 + r) - // B is equivalently defined by ind, 0 <= ind < 128 - VINT n = U_AS_I(__riscv_vadd(__riscv_vsrl(F_AS_U(vx), MAN_LEN-1, vlen), 1, vlen)); - n = __riscv_vsra(n, 1, vlen); - n = __riscv_vsub(n, EXP_BIAS, vlen); - vx = U_AS_F(__riscv_vsrl(__riscv_vsll(F_AS_U(vx), BIT_WIDTH-MAN_LEN, vlen), - BIT_WIDTH-MAN_LEN, vlen)); - vx = U_AS_F(__riscv_vadd(F_AS_U(vx), (uint64_t)EXP_BIAS << MAN_LEN, vlen)); - n = __riscv_vsub(n, n_adjust, vlen); - VFLOAT n_flt = __riscv_vfcvt_f(n, vlen); - VFLOAT B = __riscv_vfrec7(vx, vlen); - // get 7 msb of mantissa, and left shift by 3 to get address - VUINT ind = __riscv_vand(__riscv_vsrl(F_AS_U(vx), MAN_LEN-10, vlen), 0x3F8, vlen); - // adjust B to be 1.0 if ind == 0 - VBOOL adjust_B = __riscv_vmseq(ind, 0, vlen); - B = __riscv_vfmerge(B, fp_posOne, adjust_B, vlen); - // finally get r = B * in_arg - 1.0 - VFLOAT r = VFMV_VF(fp_posOne, vlen); - r = __riscv_vfmsac(r, vx, B, vlen); - - // Base-B log is logB(in_arg) = logB(2^n * 1/B) + logB(1 + r) - // (n + log2(1/B))*logB(2) + log(1+r)*logB(e) - // log2(1/B) is stored in a table - // and log(1+r) is approximated by r + poly - // poly is a polynomial in r in the form r^2 * (p0 + p1 r + ... 
) - // To deliver this result accurately, one uses logB(2) and logB(e) - // with extra precision and sums the various terms in an appropriate order - VFLOAT rsq = __riscv_vfmul(r, r, vlen); - VFLOAT rcube = __riscv_vfmul(rsq, r, vlen); - - VFLOAT poly_right = PSTEP( 0x1.9999998877038p-3, r, - PSTEP( -0x1.555c54f8b7c6cp-3, 0x1.2499765b3c27ap-3, r, - vlen), vlen); - - VFLOAT poly_left = PSTEP( -0x1.000000000001cp-1, r, - PSTEP( 0x1.55555555555a9p-2, -0x1.fffffff2018cfp-3, r, - vlen), vlen); - - VFLOAT poly = __riscv_vfmadd(poly_right, rcube, poly_left, vlen); - poly = __riscv_vfmul(rsq, poly, vlen); - // log_e(1+r) is r + poly - - // Load table values and get n_flt + T to be A + a - VINT T = __riscv_vluxei64(logD_tbl128_fixedpt, ind, vlen); - VINT T_hi = __riscv_vsll(__riscv_vsra(T, 24, vlen), 24, vlen); - VINT T_lo = __riscv_vsub(T, T_hi, vlen); - VFLOAT T_hi_flt = __riscv_vfcvt_f(T_hi, vlen); - VFLOAT A = __riscv_vfmadd(T_hi_flt, 0x1.0p-63, n_flt, vlen); - VFLOAT a = __riscv_vfcvt_f(T_lo, vlen); - a = __riscv_vfmul(a, 0x1.0p-63, vlen); - - // Compute (A + a) * (logB_2_hi + logB_2_lo) + (r + P) * (logB_e_hi + logB_e_lo) - // where B can be e, 2, or 10 - VFLOAT delta_1 = __riscv_vfmul(A, LOGB_2_LO, vlen); - delta_1 = __riscv_vfmadd(a, LOGB_2_HI, delta_1, vlen); + size_t vlen; + VFLOAT vx, vy, vy_special; + VBOOL special_args; + VINT n_adjust; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + + // NaN, Inf, and -ve handling, as well as scaling denormal input by 2^64 + EXCEPTION_HANDLING_LOG(vx, special_args, vy_special, n_adjust, vlen); + + // in_arg at this point are positive, finite and not subnormal + // Decompose in_arg into n, B, r: in_arg = 2^n (1/B) (1 + r) + // B is equivalently defined by ind, 0 <= ind < 128 + VINT n = U_AS_I( + __riscv_vadd(__riscv_vsrl(F_AS_U(vx), MAN_LEN - 1, vlen), 1, vlen)); + n = __riscv_vsra(n, 1, vlen); + n = __riscv_vsub(n, EXP_BIAS, vlen); + vx = + U_AS_F(__riscv_vsrl(__riscv_vsll(F_AS_U(vx), BIT_WIDTH - MAN_LEN, vlen), + BIT_WIDTH - MAN_LEN, vlen)); + vx = U_AS_F(__riscv_vadd(F_AS_U(vx), (uint64_t)EXP_BIAS << MAN_LEN, vlen)); + n = __riscv_vsub(n, n_adjust, vlen); + VFLOAT n_flt = __riscv_vfcvt_f(n, vlen); + VFLOAT B = __riscv_vfrec7(vx, vlen); + // get 7 msb of mantissa, and left shift by 3 to get address + VUINT ind = + __riscv_vand(__riscv_vsrl(F_AS_U(vx), MAN_LEN - 10, vlen), 0x3F8, vlen); + // adjust B to be 1.0 if ind == 0 + VBOOL adjust_B = __riscv_vmseq(ind, 0, vlen); + B = __riscv_vfmerge(B, fp_posOne, adjust_B, vlen); + // finally get r = B * in_arg - 1.0 + VFLOAT r = VFMV_VF(fp_posOne, vlen); + r = __riscv_vfmsac(r, vx, B, vlen); + + // Base-B log is logB(in_arg) = logB(2^n * 1/B) + logB(1 + r) + // (n + log2(1/B))*logB(2) + log(1+r)*logB(e) + // log2(1/B) is stored in a table + // and log(1+r) is approximated by r + poly + // poly is a polynomial in r in the form r^2 * (p0 + p1 r + ... 
) + // To deliver this result accurately, one uses logB(2) and logB(e) + // with extra precision and sums the various terms in an appropriate order + VFLOAT rsq = __riscv_vfmul(r, r, vlen); + VFLOAT rcube = __riscv_vfmul(rsq, r, vlen); + + VFLOAT poly_right = PSTEP( + 0x1.9999998877038p-3, r, + PSTEP(-0x1.555c54f8b7c6cp-3, 0x1.2499765b3c27ap-3, r, vlen), vlen); + + VFLOAT poly_left = PSTEP( + -0x1.000000000001cp-1, r, + PSTEP(0x1.55555555555a9p-2, -0x1.fffffff2018cfp-3, r, vlen), vlen); + + VFLOAT poly = __riscv_vfmadd(poly_right, rcube, poly_left, vlen); + poly = __riscv_vfmul(rsq, poly, vlen); + // log_e(1+r) is r + poly + + // Load table values and get n_flt + T to be A + a + VINT T = __riscv_vluxei64(logD_tbl128_fixedpt, ind, vlen); + VINT T_hi = __riscv_vsll(__riscv_vsra(T, 24, vlen), 24, vlen); + VINT T_lo = __riscv_vsub(T, T_hi, vlen); + VFLOAT T_hi_flt = __riscv_vfcvt_f(T_hi, vlen); + VFLOAT A = __riscv_vfmadd(T_hi_flt, 0x1.0p-63, n_flt, vlen); + VFLOAT a = __riscv_vfcvt_f(T_lo, vlen); + a = __riscv_vfmul(a, 0x1.0p-63, vlen); + + // Compute (A + a) * (logB_2_hi + logB_2_lo) + (r + P) * (logB_e_hi + + // logB_e_lo) where B can be e, 2, or 10 + VFLOAT delta_1 = __riscv_vfmul(A, LOGB_2_LO, vlen); + delta_1 = __riscv_vfmadd(a, LOGB_2_HI, delta_1, vlen); #if !defined(COMPILE_FOR_LOG) - delta_1 = __riscv_vfmacc(delta_1, LOGB_e_LO, r, vlen); - poly = __riscv_vfmadd(poly, LOGB_e_HI, delta_1, vlen); + delta_1 = __riscv_vfmacc(delta_1, LOGB_e_LO, r, vlen); + poly = __riscv_vfmadd(poly, LOGB_e_HI, delta_1, vlen); #else - poly = __riscv_vfadd(poly, delta_1, vlen); + poly = __riscv_vfadd(poly, delta_1, vlen); #endif #if !defined(COMPILE_FOR_LOG) - poly = __riscv_vfmacc(poly, LOGB_e_HI, r, vlen); + poly = __riscv_vfmacc(poly, LOGB_e_HI, r, vlen); #else - poly = __riscv_vfadd(poly, r, vlen); + poly = __riscv_vfadd(poly, r, vlen); #endif #if !defined(COMPILE_FOR_LOG2) - vy = __riscv_vfmadd(A, LOGB_2_HI, poly, vlen); + vy = __riscv_vfmadd(A, LOGB_2_HI, poly, vlen); #else - vy = __riscv_vfadd(A, poly, vlen); + vy = __riscv_vfadd(A, poly, vlen); #endif - vy = __riscv_vmerge(vy, vy_special, special_args, vlen); + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); - // copy vy into y and increment addr pointers - VFSTORE_OUTARG1(vy, vlen); + // copy vy into y and increment addr pointers + VFSTORE_OUTARG1(vy, vlen); - INCREMENT_INARG1(vlen); - INCREMENT_OUTARG1(vlen); - - } - RESTORE_FRM; + INCREMENT_INARG1(vlen); + INCREMENT_OUTARG1(vlen); + } + RESTORE_FRM; } -// Version uses the identity log(x) = 2 atanh((x-1)/(x+1)) +// Version uses the identity log(x) = 2 atanh((x-1)/(x+1)) void F_VER2(API) { - size_t vlen; - VFLOAT vx, vy, vy_special; - VBOOL special_args; - VINT n_adjust; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - - // NaN, Inf, and -ve handling, as well as scaling denormal input by 2^64 - EXCEPTION_HANDLING_LOG(vx, special_args, vy_special, n_adjust, vlen); - - // in_arg at this point are positive, finite and not subnormal - // Decompose in_arg into 2^n * X, where 0.75 <= X < 1.5 - // logB(2^n X) = n * logB(2) + log(X) * logB(e) - // log(X) = 2 atanh((X-1)/(X+1)) - - // Argument reduction: represent in_arg as 2^n X where 0.75 <= X < 1.5 - // Then compute 2(X-1)/(X+1) as r + delta_r. 
- // natural log, log(X) = 2 atanh(w/2) = w + p1 w^3 + p2 w5 ...; w = r+delta_r - VINT n = U_AS_I(__riscv_vadd(__riscv_vsrl(F_AS_U(vx), MAN_LEN-8, vlen), 0x96, vlen)); - n = __riscv_vsra(n, 8, vlen); - n = __riscv_vsub(n, EXP_BIAS, vlen); - vx = I_AS_F(__riscv_vsub(F_AS_I(vx), __riscv_vsll(n, MAN_LEN, vlen), vlen)); - n = __riscv_vsub(n, n_adjust, vlen); - VFLOAT n_flt = __riscv_vfcvt_f(n, vlen); - - VFLOAT numer = __riscv_vfsub(vx, fp_posOne, vlen); - numer = __riscv_vfadd(numer, numer, vlen); - VFLOAT denom, delta_d; - //FAST2SUM(fp_posOne, vx, denom, delta_d, vlen); - denom = __riscv_vfadd(vx, fp_posOne, vlen); - delta_d = __riscv_vfrsub(denom, fp_posOne, vlen); - delta_d = __riscv_vfadd(delta_d, vx, vlen); - VFLOAT r, delta_r; - DIV_N1D2(numer, denom, delta_d, r, delta_r, vlen); - - VFLOAT dummy = r; - - VFLOAT rsq = __riscv_vfmul(r, r, vlen); - VFLOAT rcube = __riscv_vfmul(rsq, r, vlen); - VFLOAT r6 = __riscv_vfmul(rcube, rcube, vlen); - - VFLOAT poly_right = PSTEP( 0x1.c71c51c73bb7fp-12, rsq, - PSTEP( 0x1.74664bed42062p-14, rsq, - PSTEP( 0x1.39a071f83b771p-16, 0x1.2f123764244dfp-18, rsq, - vlen), vlen), vlen); - - VFLOAT poly_left = PSTEP( 0x1.5555555555594p-4, rsq, - PSTEP( 0x1.999999997f6b6p-7, 0x1.2492494248a48p-9, rsq, - vlen), vlen); - - VFLOAT poly = __riscv_vfmadd(poly_right, r6, poly_left, vlen); - poly = __riscv_vfmadd(poly, rcube, delta_r, vlen); - // At this point r + poly approximates log(X) - - // Reconstruction: logB(in_arg) = n logB(2) + log(X) * logB(e), computed as - // n*(logB_2_hi + logB_2_lo) + r * (logB_e_hi + logB_e_lo) + poly * logB_e_hi - // It is best to compute n * logB_2_hi + r * logB_e_hi in extra precision - VFLOAT A, a, B, b, S, s; + size_t vlen; + VFLOAT vx, vy, vy_special; + VBOOL special_args; + VINT n_adjust; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + + // NaN, Inf, and -ve handling, as well as scaling denormal input by 2^64 + EXCEPTION_HANDLING_LOG(vx, special_args, vy_special, n_adjust, vlen); + + // in_arg at this point are positive, finite and not subnormal + // Decompose in_arg into 2^n * X, where 0.75 <= X < 1.5 + // logB(2^n X) = n * logB(2) + log(X) * logB(e) + // log(X) = 2 atanh((X-1)/(X+1)) + + // Argument reduction: represent in_arg as 2^n X where 0.75 <= X < 1.5 + // Then compute 2(X-1)/(X+1) as r + delta_r. 
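The identity behind this reduction is log(X) = 2 atanh((X-1)/(X+1)); with w = 2(X-1)/(X+1), the series quoted in the comment that resumes below is w + w^3/12 + w^5/80 + w^7/448 + ..., which is why the poly_left coefficients are close to 1/12, 1/80 and 1/448. A minimal scalar sketch of the scheme, assuming positive normal FP64 input (the names and the truncated series are illustrative only; the vector code also carries delta_r and more terms for extra precision):

#include <math.h>

/* Sketch of the atanh-based log; low accuracy, for illustration only. */
static double log_via_atanh(double x) {
  int n;
  double X = frexp(x, &n);           /* x = X * 2^n with X in [0.5, 1) */
  if (X < 0.75) {                    /* move X into [0.75, 1.5) */
    X *= 2.0;
    n -= 1;
  }
  double w = 2.0 * (X - 1.0) / (X + 1.0);
  double w2 = w * w;
  /* 2*atanh(w/2) = w + w^3/12 + w^5/80 + w^7/448 + ... */
  double poly = w2 * (1.0 / 12 + w2 * (1.0 / 80 + w2 * (1.0 / 448)));
  return (double)n * 0x1.62e42fefa39efp-1 + (w + w * poly); /* n*log(2) + log(X) */
}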
+ // natural log, log(X) = 2 atanh(w/2) = w + p1 w^3 + p2 w5 ...; w = + // r+delta_r + VINT n = U_AS_I( + __riscv_vadd(__riscv_vsrl(F_AS_U(vx), MAN_LEN - 8, vlen), 0x96, vlen)); + n = __riscv_vsra(n, 8, vlen); + n = __riscv_vsub(n, EXP_BIAS, vlen); + vx = I_AS_F(__riscv_vsub(F_AS_I(vx), __riscv_vsll(n, MAN_LEN, vlen), vlen)); + n = __riscv_vsub(n, n_adjust, vlen); + VFLOAT n_flt = __riscv_vfcvt_f(n, vlen); + + VFLOAT numer = __riscv_vfsub(vx, fp_posOne, vlen); + numer = __riscv_vfadd(numer, numer, vlen); + VFLOAT denom, delta_d; + // FAST2SUM(fp_posOne, vx, denom, delta_d, vlen); + denom = __riscv_vfadd(vx, fp_posOne, vlen); + delta_d = __riscv_vfrsub(denom, fp_posOne, vlen); + delta_d = __riscv_vfadd(delta_d, vx, vlen); + VFLOAT r, delta_r; + DIV_N1D2(numer, denom, delta_d, r, delta_r, vlen); + + VFLOAT dummy = r; + + VFLOAT rsq = __riscv_vfmul(r, r, vlen); + VFLOAT rcube = __riscv_vfmul(rsq, r, vlen); + VFLOAT r6 = __riscv_vfmul(rcube, rcube, vlen); + + VFLOAT poly_right = PSTEP( + 0x1.c71c51c73bb7fp-12, rsq, + PSTEP(0x1.74664bed42062p-14, rsq, + PSTEP(0x1.39a071f83b771p-16, 0x1.2f123764244dfp-18, rsq, vlen), + vlen), + vlen); + + VFLOAT poly_left = PSTEP( + 0x1.5555555555594p-4, rsq, + PSTEP(0x1.999999997f6b6p-7, 0x1.2492494248a48p-9, rsq, vlen), vlen); + + VFLOAT poly = __riscv_vfmadd(poly_right, r6, poly_left, vlen); + poly = __riscv_vfmadd(poly, rcube, delta_r, vlen); + // At this point r + poly approximates log(X) + + // Reconstruction: logB(in_arg) = n logB(2) + log(X) * logB(e), computed as + // n*(logB_2_hi + logB_2_lo) + r * (logB_e_hi + logB_e_lo) + poly * + // logB_e_hi It is best to compute n * logB_2_hi + r * logB_e_hi in extra + // precision + VFLOAT A, a, B, b, S, s; #if !defined(COMPILE_FOR_LOG2) - PROD_X1Y1(n_flt, LOGB_2_HI, A, a, vlen); + PROD_X1Y1(n_flt, LOGB_2_HI, A, a, vlen); #else - A = n_flt; - a = I_AS_F(__riscv_vxor(F_AS_I(a), F_AS_I(a), vlen)); + A = n_flt; + a = I_AS_F(__riscv_vxor(F_AS_I(a), F_AS_I(a), vlen)); #endif #if !defined(COMPILE_FOR_LOG) - PROD_X1Y1(r, LOGB_e_HI, B, b, vlen); + PROD_X1Y1(r, LOGB_e_HI, B, b, vlen); #else - B = r; - b = I_AS_F(__riscv_vxor(F_AS_I(b), F_AS_I(b), vlen)); + B = r; + b = I_AS_F(__riscv_vxor(F_AS_I(b), F_AS_I(b), vlen)); #endif - a = __riscv_vfadd(a, b, vlen); - FAST2SUM(A, B, S, s, vlen); - s = __riscv_vfadd(s, a, vlen); + a = __riscv_vfadd(a, b, vlen); + FAST2SUM(A, B, S, s, vlen); + s = __riscv_vfadd(s, a, vlen); #if !defined(COMPILE_FOR_LOG) - s = __riscv_vfmacc(s, LOGB_e_LO, r, vlen); - s = __riscv_vfmacc(s, LOGB_e_HI, poly, vlen); + s = __riscv_vfmacc(s, LOGB_e_LO, r, vlen); + s = __riscv_vfmacc(s, LOGB_e_HI, poly, vlen); #else - s = __riscv_vfadd(s, poly, vlen); + s = __riscv_vfadd(s, poly, vlen); #endif #if !defined(COMPILE_FOR_LOG2) - s = __riscv_vfmacc(s, LOGB_2_LO, n_flt, vlen); + s = __riscv_vfmacc(s, LOGB_2_LO, n_flt, vlen); #endif - vy = __riscv_vfadd(S, s, vlen); - - // vy = dummy; - vy = __riscv_vmerge(vy, vy_special, special_args, vlen); + vy = __riscv_vfadd(S, s, vlen); - // copy vy into y and increment addr pointers - VFSTORE_OUTARG1(vy, vlen); + // vy = dummy; + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); - INCREMENT_INARG1(vlen); - INCREMENT_OUTARG1(vlen); + // copy vy into y and increment addr pointers + VFSTORE_OUTARG1(vy, vlen); - } - RESTORE_FRM; + INCREMENT_INARG1(vlen); + INCREMENT_OUTARG1(vlen); + } + RESTORE_FRM; } diff --git a/include/rvvlm_powD.inc.h b/include/rvvlm_powD.inc.h index 4a02ce7..570a15c 100644 --- a/include/rvvlm_powD.inc.h +++ b/include/rvvlm_powD.inc.h @@ -3,155 
+3,162 @@ // SPDX-License-Identifier: Apache-2.0 // Macros for common small-block codes -#define EXCEPTION_HANDLING_POW(vx, vy, special_args, vz_special, vlen) \ -VUINT vclass_x = __riscv_vfclass(vx, vlen); \ -VUINT vclass_y = __riscv_vfclass(vy, vlen); \ -do { \ -/* Exception handling: handle x or y being NaN, Inf, and Zero \ - * and replace them with 2.0 so that normal computations with them \ - * do not raise problems. \ - * Note that we do not call out negative x for special handling. The \ - * normal computation essentially computes |x|^y, but identify x < 0 \ - * later on; replacing the answer appropriately depending on whether \ - * y is an integer (resulting in +-(|x|^y)) or not (resulting in NaN). \ - * \ - * In side the special argument handling, we handle 3 cases separately \ - * x AND y both special, only x is special, and only y is special. \ - */ \ - \ - VBOOL y_special, x_special; \ - /* 0x399 is NaN/Inf/Zero */ \ - IDENTIFY(vclass_y, 0x399, y_special, vlen); \ - IDENTIFY(vclass_x, 0x399, x_special, vlen); \ - \ - special_args = __riscv_vmor(x_special, y_special, vlen); \ - UINT nb_special_args = __riscv_vcpop(special_args, vlen); \ - \ - if (nb_special_args > 0) { \ - /* Expect this to be taken rarely. We handle separately the three \ - * mutually exclusive cases of both x and y are special, and only one is special \ - */ \ - VUINT vclass_z; \ - VBOOL id_mask; \ - vz_special = VFMV_VF(fp_posOne, vlen); \ - VBOOL current_cases = __riscv_vmand(x_special, y_special, vlen); \ - if (__riscv_vcpop(current_cases, vlen) > 0) { \ - /* x AND y are special */ \ - \ - /* pow(any, 0) is 1.0 */ \ - IDENTIFY(vclass_y, class_Zero, id_mask, vlen); \ - id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ - vy = __riscv_vfmerge(vy, fp_posOne, id_mask, vlen); \ - vx = __riscv_vfmerge(vx, fp_posOne, id_mask, vlen); \ - VBOOL restricted_cases = __riscv_vmandn(current_cases, id_mask, vlen); \ - \ - /* pow(+-Inf,+-Inf) = pow(+Inf,+-Inf), so substitue -Inf by +Inf for x */ \ - IDENTIFY(vclass_x, class_negInf, id_mask, vlen); \ - id_mask = __riscv_vmand(id_mask, restricted_cases, vlen); \ - vx = __riscv_vfmerge(vx, fp_posInf, id_mask, vlen); \ - \ - /* pow(0, +-Inf) = +Inf or 0. Substitute x by -Inf to mimic log(x) */ \ - IDENTIFY(vclass_x, class_Zero, id_mask, vlen); \ - id_mask = __riscv_vmand(id_mask, restricted_cases, vlen); \ - vx = __riscv_vfmerge(vx, fp_negInf, id_mask, vlen); \ - \ - /* multiply the substituted vx * vy that mimics y*log(x) to some extent. 
\ - * This product will also generate the necessary NaN and invalid operation signal \ - */ \ - vz_special = __riscv_vfmul_mu(current_cases, vz_special, vx, vy, vlen); \ - vclass_z = __riscv_vfclass(vz_special, vlen); \ - IDENTIFY(vclass_z, class_negInf, id_mask, vlen); \ - id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ - vz_special = __riscv_vfmerge(vz_special, fp_posZero, id_mask, vlen); \ - /* end of handling for BOTH x and y are special */ \ - } \ - \ - \ - current_cases = __riscv_vmandn(x_special, y_special, vlen); \ - if (__riscv_vcpop(current_cases, vlen) > 0) { \ - /* x only is special */ \ - \ - VINT sign_x = __riscv_vand(F_AS_I(vx), F_AS_I(vx), vlen); \ - /* Here we change x that is +-Inf into +Inf, and x that is +-0 to -Inf */ \ - IDENTIFY(vclass_x, class_Zero, id_mask, vlen); \ - id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ - vx = __riscv_vfmerge(vx, fp_negInf, id_mask, vlen); \ - \ - IDENTIFY(vclass_x, class_Inf, id_mask, vlen); \ - id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ - vx = __riscv_vfmerge(vx, fp_posInf, id_mask, vlen); \ - \ - /* We need to identify whether y is of integer value and if so its parity. \ - * We first clip y values to +-2^53, because FP value of this magnitude and beyond \ - * are always even integers \ - */ \ - vy = __riscv_vfmin_mu(current_cases, vy, vy, 0x1.0p53, vlen); \ - vy = __riscv_vfmax_mu(current_cases, vy, vy, -0x1.0p53, vlen); \ - VINT y_to_int = __riscv_vfcvt_x(current_cases, vy, vlen); \ - VFLOAT y_to_int_fp = __riscv_vfcvt_f(current_cases, y_to_int, vlen); \ - VBOOL y_is_int = __riscv_vmfeq(current_cases, vy, y_to_int_fp, vlen); \ - VINT sign_z = __riscv_vsll(y_to_int, 63, vlen); \ - /* the parity is used later on to manipulate sign, hence sll 63 bits */ \ - \ - /* we have set vx to mimic log(|x|), so we now compute y * log(|x|) */ \ - vz_special = __riscv_vfmul_mu(current_cases, vz_special, vy, vx, vlen); \ - /* map -Inf to +0 */ \ - vclass_z = __riscv_vfclass(vz_special, vlen); \ - IDENTIFY(vclass_z, class_negInf, id_mask, vlen); \ - id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ - vz_special = __riscv_vfmerge(vz_special, fp_posZero, id_mask, vlen); \ - /* now must set the sign of vz_special for x in {Zero, Inf} and y of integer value */ \ - \ - IDENTIFY(vclass_x, class_Inf|class_Zero, id_mask, vlen); \ - id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ - VFLOAT vz_tmp = I_AS_F(__riscv_vand(id_mask, sign_x, sign_z, vlen)); \ - vz_tmp = __riscv_vfsgnj(id_mask, vz_special, vz_tmp, vlen); \ - vz_special = __riscv_vmerge(vz_special, vz_tmp, id_mask, vlen); \ - } \ - \ - current_cases = __riscv_vmandn(y_special, x_special, vlen); \ - if (__riscv_vcpop(current_cases, vlen) > 0) { \ - /* y only is special */ \ - \ - /* Here x is finite and non-zero. But x == 1.0 is special \ - * in that 1.0^anything is 1.0, including when y is a NaN. \ - * Aside from this case, we need to differentiate |x| <, ==, > 1 \ - * so as to handle y == +-Inf appropriately. 
\ - */ \ - \ - /* If |x| == 1.0, replace y with 0.0 */ \ - VFLOAT vz_tmp = __riscv_vfsgnj(current_cases, vx, fp_posOne, vlen); \ - vz_tmp = __riscv_vfsub(current_cases, vz_tmp, fp_posOne, vlen); \ - id_mask = __riscv_vmfeq(vz_tmp, fp_posZero, vlen); \ - id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ - VBOOL id_mask2; \ - IDENTIFY(vclass_y, class_Inf|class_Zero, id_mask2, vlen); \ - id_mask2 = __riscv_vmand(id_mask, id_mask2, vlen); \ - vy = __riscv_vfmerge(vy, fp_posZero, id_mask2, vlen); \ - \ - /* compute (|x|-1) * y yeilding the correct signed infinities */ \ - vz_tmp = __riscv_vfmul(current_cases, vz_tmp, vy, vlen); \ - /* except we need to set this to +0 if x == 1 (even if y is NaN) */ \ - id_mask = __riscv_vmfeq(vx, fp_posOne, vlen); \ - id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ - vz_tmp = __riscv_vfmerge(vz_tmp, fp_posZero, id_mask, vlen); \ - vz_special = __riscv_vmerge(vz_special, vz_tmp, current_cases, vlen); \ - \ - /* map vz_special values of -Inf to 0 and 0 to 1.0 */ \ - vclass_z = __riscv_vfclass(vz_special, vlen); \ - IDENTIFY(vclass_z, class_negInf, id_mask, vlen); \ - id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ - vz_special = __riscv_vfmerge(vz_special, fp_posZero, id_mask, vlen); \ - IDENTIFY(vclass_z, class_Zero, id_mask, vlen); \ - id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ - vz_special = __riscv_vfmerge(vz_special, fp_posOne, id_mask, vlen); \ - } \ - \ - /* finally, substitue 1.0 for x and y when either x or y is special */ \ - vx = __riscv_vfmerge(vx, fp_posOne, special_args, vlen); \ - vy = __riscv_vfmerge(vy, fp_posOne, special_args, vlen); \ - } \ -} while(0) +#define EXCEPTION_HANDLING_POW(vx, vy, special_args, vz_special, vlen) \ + VUINT vclass_x = __riscv_vfclass(vx, vlen); \ + VUINT vclass_y = __riscv_vfclass(vy, vlen); \ + do { \ + /* Exception handling: handle x or y being NaN, Inf, and Zero \ + * and replace them with 2.0 so that normal computations with them \ + * do not raise problems. \ + * Note that we do not call out negative x for special handling. The \ + * normal computation essentially computes |x|^y, but identify x < 0 \ + * later on; replacing the answer appropriately depending on whether \ + * y is an integer (resulting in +-(|x|^y)) or not (resulting in NaN). \ + * \ + * In side the special argument handling, we handle 3 cases separately \ + * x AND y both special, only x is special, and only y is special. \ + */ \ + \ + VBOOL y_special, x_special; \ + /* 0x399 is NaN/Inf/Zero */ \ + IDENTIFY(vclass_y, 0x399, y_special, vlen); \ + IDENTIFY(vclass_x, 0x399, x_special, vlen); \ + \ + special_args = __riscv_vmor(x_special, y_special, vlen); \ + UINT nb_special_args = __riscv_vcpop(special_args, vlen); \ + \ + if (nb_special_args > 0) { \ + /* Expect this to be taken rarely. 
We handle separately the three \ + * mutually exclusive cases: both x and y special, only x special, and \ + * only y special. \ + */ \ + VUINT vclass_z; \ + VBOOL id_mask; \ + vz_special = VFMV_VF(fp_posOne, vlen); \ + VBOOL current_cases = __riscv_vmand(x_special, y_special, vlen); \ + if (__riscv_vcpop(current_cases, vlen) > 0) { \ + /* x AND y are special */ \ + \ + /* pow(any, 0) is 1.0 */ \ + IDENTIFY(vclass_y, class_Zero, id_mask, vlen); \ + id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ + vy = __riscv_vfmerge(vy, fp_posOne, id_mask, vlen); \ + vx = __riscv_vfmerge(vx, fp_posOne, id_mask, vlen); \ + VBOOL restricted_cases = __riscv_vmandn(current_cases, id_mask, vlen); \ + \ + /* pow(+-Inf,+-Inf) = pow(+Inf,+-Inf), so substitute -Inf with +Inf for x \ */ \ + IDENTIFY(vclass_x, class_negInf, id_mask, vlen); \ + id_mask = __riscv_vmand(id_mask, restricted_cases, vlen); \ + vx = __riscv_vfmerge(vx, fp_posInf, id_mask, vlen); \ + \ + /* pow(0, +-Inf) = +Inf or 0. Substitute x by -Inf to mimic log(x) */ \ + IDENTIFY(vclass_x, class_Zero, id_mask, vlen); \ + id_mask = __riscv_vmand(id_mask, restricted_cases, vlen); \ + vx = __riscv_vfmerge(vx, fp_negInf, id_mask, vlen); \ + \ + /* multiply the substituted vx * vy that mimics y*log(x) to some \ + * extent. This product will also generate the necessary NaN and \ + * invalid operation signal \ + */ \ + vz_special = \ + __riscv_vfmul_mu(current_cases, vz_special, vx, vy, vlen); \ + vclass_z = __riscv_vfclass(vz_special, vlen); \ + IDENTIFY(vclass_z, class_negInf, id_mask, vlen); \ + id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ + vz_special = __riscv_vfmerge(vz_special, fp_posZero, id_mask, vlen); \ + /* end of handling for BOTH x and y are special */ \ + } \ + \ + current_cases = __riscv_vmandn(x_special, y_special, vlen); \ + if (__riscv_vcpop(current_cases, vlen) > 0) { \ + /* x only is special */ \ + \ + VINT sign_x = __riscv_vand(F_AS_I(vx), F_AS_I(vx), vlen); \ + /* Here we change x that is +-Inf into +Inf, and x that is +-0 to -Inf \ */ \ + IDENTIFY(vclass_x, class_Zero, id_mask, vlen); \ + id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ + vx = __riscv_vfmerge(vx, fp_negInf, id_mask, vlen); \ + \ + IDENTIFY(vclass_x, class_Inf, id_mask, vlen); \ + id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ + vx = __riscv_vfmerge(vx, fp_posInf, id_mask, vlen); \ + \ + /* We need to identify whether y is of integer value and if so its \ + * parity. 
We first clip y values to +-2^53, because FP value of this \ + * magnitude and beyond are always even integers \ + */ \ + vy = __riscv_vfmin_mu(current_cases, vy, vy, 0x1.0p53, vlen); \ + vy = __riscv_vfmax_mu(current_cases, vy, vy, -0x1.0p53, vlen); \ + VINT y_to_int = __riscv_vfcvt_x(current_cases, vy, vlen); \ + VFLOAT y_to_int_fp = __riscv_vfcvt_f(current_cases, y_to_int, vlen); \ + VBOOL y_is_int = __riscv_vmfeq(current_cases, vy, y_to_int_fp, vlen); \ + VINT sign_z = __riscv_vsll(y_to_int, 63, vlen); \ + /* the parity is used later on to manipulate sign, hence sll 63 bits \ + */ \ + \ + /* we have set vx to mimic log(|x|), so we now compute y * log(|x|) */ \ + vz_special = \ + __riscv_vfmul_mu(current_cases, vz_special, vy, vx, vlen); \ + /* map -Inf to +0 */ \ + vclass_z = __riscv_vfclass(vz_special, vlen); \ + IDENTIFY(vclass_z, class_negInf, id_mask, vlen); \ + id_mask = __riscv_vmand(id_mask, current_cases, vlen); \ + vz_special = __riscv_vfmerge(vz_special, fp_posZero, id_mask, vlen); \ + /* now must set the sign of vz_special for x in {Zero, Inf} and y of \ + * integer value */ \ + \ + IDENTIFY(vclass_x, class_Inf | class_Zero, id_mask, vlen); \ + id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ + VFLOAT vz_tmp = I_AS_F(__riscv_vand(id_mask, sign_x, sign_z, vlen)); \ + vz_tmp = __riscv_vfsgnj(id_mask, vz_special, vz_tmp, vlen); \ + vz_special = __riscv_vmerge(vz_special, vz_tmp, id_mask, vlen); \ + } \ + \ + current_cases = __riscv_vmandn(y_special, x_special, vlen); \ + if (__riscv_vcpop(current_cases, vlen) > 0) { \ + /* y only is special */ \ + \ + /* Here x is finite and non-zero. But x == 1.0 is special \ + * in that 1.0^anything is 1.0, including when y is a NaN. \ + * Aside from this case, we need to differentiate |x| <, ==, > 1 \ + * so as to handle y == +-Inf appropriately. 
\ + */ \ + \ + /* If |x| == 1.0, replace y with 0.0 */ \ + VFLOAT vz_tmp = __riscv_vfsgnj(current_cases, vx, fp_posOne, vlen); \ + vz_tmp = __riscv_vfsub(current_cases, vz_tmp, fp_posOne, vlen); \ + id_mask = __riscv_vmfeq(vz_tmp, fp_posZero, vlen); \ + id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ + VBOOL id_mask2; \ + IDENTIFY(vclass_y, class_Inf | class_Zero, id_mask2, vlen); \ + id_mask2 = __riscv_vmand(id_mask, id_mask2, vlen); \ + vy = __riscv_vfmerge(vy, fp_posZero, id_mask2, vlen); \ + \ + /* compute (|x|-1) * y yeilding the correct signed infinities */ \ + vz_tmp = __riscv_vfmul(current_cases, vz_tmp, vy, vlen); \ + /* except we need to set this to +0 if x == 1 (even if y is NaN) */ \ + id_mask = __riscv_vmfeq(vx, fp_posOne, vlen); \ + id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ + vz_tmp = __riscv_vfmerge(vz_tmp, fp_posZero, id_mask, vlen); \ + vz_special = __riscv_vmerge(vz_special, vz_tmp, current_cases, vlen); \ + \ + /* map vz_special values of -Inf to 0 and 0 to 1.0 */ \ + vclass_z = __riscv_vfclass(vz_special, vlen); \ + IDENTIFY(vclass_z, class_negInf, id_mask, vlen); \ + id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ + vz_special = __riscv_vfmerge(vz_special, fp_posZero, id_mask, vlen); \ + IDENTIFY(vclass_z, class_Zero, id_mask, vlen); \ + id_mask = __riscv_vmand(current_cases, id_mask, vlen); \ + vz_special = __riscv_vfmerge(vz_special, fp_posOne, id_mask, vlen); \ + } \ + \ + /* finally, substitue 1.0 for x and y when either x or y is special */ \ + vx = __riscv_vfmerge(vx, fp_posOne, special_args, vlen); \ + vy = __riscv_vfmerge(vy, fp_posOne, special_args, vlen); \ + } \ + } while (0) static const double two_to_neg63 = 0x1.0p-63; static const uint64_t bias = 0x3ff0000000000000; @@ -169,9 +176,9 @@ static const double two_to_65 = 0x1.0p65; static const double negtwo_to_65 = -0x1.0p65; #if (STRIDE == UNIT_STRIDE) -#define F_VER1 RVVLM_POWD_TBL +#define F_VER1 RVVLM_POWD_TBL #else -#define F_VER1 RVVLM_POWDI_TBL +#define F_VER1 RVVLM_POWDI_TBL #endif #include @@ -179,203 +186,203 @@ static const double negtwo_to_65 = -0x1.0p65; // Version 1 is reduction to standard primary interval. // Reduced argument is represented as one FP64 variable. void F_VER1(API) { - size_t vlen; - VFLOAT vx, vy, vz, vz_special; - VBOOL special_args; - - SET_ROUNDTONEAREST; - // stripmining over input arguments - for (; _inarg_n > 0; _inarg_n -= vlen) { - vlen = VSET(_inarg_n); - vx = VFLOAD_INARG1(vlen); - vy = VFLOAD_INARG2(vlen); - - // Set results when one of the inputs is NaN/Inf/Zero - EXCEPTION_HANDLING_POW(vx, vy, special_args, vz_special, vlen); - - // Normal computations. Here, both x and y are finite and non-zero. - // We compute 2^( y log_2(x) ) on the high level. But when x < 0, - // we must handle the cases when y is of integer value, making x^y well defined. - // So in essence, we try to compute 2^(y log_2(|x|)) and then figure out if - // one should replace this with NaN, or accept this numerical result with the - // possible flipping of its sign (x is negative and y is an odd integer). 
- - // Decompose in_arg into n, B, r - VINT n_adjust, sign_x; - VBOOL id_mask; - n_adjust = __riscv_vxor(n_adjust, n_adjust, vlen); - sign_x = __riscv_vxor(sign_x, sign_x, vlen); - sign_x = F_AS_I(__riscv_vfsgnj(I_AS_F(sign_x), vx, vlen)); - vx = __riscv_vfsgnjx(vx, vx, vlen); - IDENTIFY(vclass_x, class_Denorm, id_mask, vlen); - vx = __riscv_vfmul_mu(id_mask, vx, vx, 0x1.0p65, vlen); - n_adjust = __riscv_vmerge(n_adjust, 65, id_mask, vlen); - - VINT n = __riscv_vadd(F_AS_I(vx), round_up, vlen); - n = __riscv_vsub(n, bias, vlen); - n = __riscv_vsra(n, 52, vlen); - n = __riscv_vsub(n, n_adjust, vlen); - - VFLOAT A = __riscv_vfcvt_f(n, vlen); - - // To get frec7(X) suffices to get frecp7 of X with its exponent field set to bias - // The main reason for this step is that should the exponent of X be the largest - // finite exponent, frec7(X) will be subnormal and carry less precision. - // Moreover, we need to get the 7 mantissa bits of X for table lookup later on - VUINT ind = __riscv_vand(F_AS_U(vx), zero_mask_expo, vlen); - - // normalize exponent of vx - vx = U_AS_F(__riscv_vor(ind, bias, vlen)); - VFLOAT B = __riscv_vfrec7(vx, vlen); - ind = __riscv_vsrl(ind, 45, vlen); // 7 leading mantissa bit - ind = __riscv_vsll(ind, 4, vlen); // left shifted 4 (16-byte table) - - // adjust B to be 1.0 if ind == 0 - VBOOL adjust_B = __riscv_vmseq(ind, 0, vlen); - B = __riscv_vfmerge(B, fp_posOne, adjust_B, vlen); - VFLOAT r = VFMV_VF(fp_posOne, vlen); - r = __riscv_vfmsac(r, vx, B, vlen); - - // with A = n in float format, r, and ind we can carry out floating-point computations - // (A + T) + log_e(1+r) * (1/log_e(2)) - // compute log_e(1+r) by a polynomial approximation. To obtian an accurate pow(x,y) - // in the end, we must obtain at least 10 extra bits of precision over FP64. - // So log_e(1+r) is approximated by a degree-9 polynomial - // r - r^2/2 + r^3[ (p6 + r p5 + r^2 p4 ) + r^3 (p3 + r p2 + r^2 p1 + r^3 p0) ] - // r - r^2/2 + poly; and furthermore, r - r^2/2 is computed as P + p, - // 1/log(2) is stored as log2_inv_hi, log2_inv_lo, and T is broken into T_hi, T_lo - // So, we need - // (A + T_hi) + log2_inv_hi * P + - // log2_inv_hi * poly + T_lo + log2_inv_lo*P - // Note that log_2(|x|) needs be to represented in 2 FP64 variables as - // we need to have log_2(|x|) in extra precision. 
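The extra-precision split this comment refers to (keeping r - r^2/2 as a head P plus tail p) is built from one exact scaling and two FMAs; a scalar sketch under round-to-nearest, with an illustrative helper name:

#include <math.h>

/* Sketch: split r - r*r/2 into a rounded head P and its error term p,
 * mirroring the vfmul/vfmadd/vfsub/vfmacc sequence in the kernel. */
static void split_r_minus_half_rsq(double r, double *P, double *p) {
  double r_prime = -0.5 * r;     /* exact: multiplication by a power of two */
  *P = fma(r_prime, r, r);       /* one rounding of r - r^2/2 */
  *p = fma(r_prime, r, r - *P);  /* tail; r - P is exact for the small r here */
}

Because P lies within a factor of two of r for the small reduced arguments produced here, r - P is exact by Sterbenz's lemma, so p carries only the rounding error of P.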
- // - VFLOAT rcube = __riscv_vfmul(r, r, vlen); - rcube = __riscv_vfmul(rcube, r, vlen); - - VFLOAT poly_right = PSTEP( -0x1.555555483d731p-3, r, - PSTEP( 0x1.2492453584b8ep-3, r, - PSTEP( -0x1.0005fa6ef2342p-3, 0x1.c7fe32d120e6bp-4, r, - vlen), vlen), vlen); - - VFLOAT poly_left = PSTEP( 0x1.5555555555555p-2, r, - PSTEP(-0x1.000000000003cp-2, 0x1.99999999a520ep-3, r, - vlen), vlen); - - VFLOAT poly = __riscv_vfmadd(poly_right, rcube, poly_left, vlen); - // poly is (p6 + r p5 + r^2 p4 ) + r^3 (p3 + r p2 + r^2 p1 + r^3 p0) - - VFLOAT r_prime = __riscv_vfmul(r, -0x1.0p-1, vlen); // exact product - VFLOAT P = __riscv_vfmadd(r_prime, r, r, vlen); - VFLOAT p = __riscv_vfsub(r, P, vlen); - p = __riscv_vfmacc(p, r_prime, r, vlen); - // P + p is r - r^2/2 to extra precision - poly = __riscv_vfmadd(poly, rcube, p, vlen); - // Now P + poly is log_e(1+r) to extra precision - - // Load table values and get n_flt + T to be A + a - VFLOAT T_hi_flt = __riscv_vluxei64(logtbl_4_powD_128_hi_lo, ind, vlen); - ind = __riscv_vadd(ind, 8, vlen); - VFLOAT T_lo_flt = __riscv_vluxei64(logtbl_4_powD_128_hi_lo, ind, vlen); - - A = __riscv_vfadd(A, T_hi_flt, vlen); - // (A + T_hi) + log2_inv_hi * P + log2_inv_hi * poly + log2_inv_lo*P + T_lo - // is log2(|x|) to extra precision - VFLOAT log2x_hi = __riscv_vfmadd(P, log2_inv_hi, A, vlen); - VFLOAT log2x_lo = __riscv_vfsub(A, log2x_hi, vlen); - log2x_lo = __riscv_vfmacc(log2x_lo, log2_inv_hi, P, vlen); - - T_lo_flt = __riscv_vfmacc(T_lo_flt, log2_inv_lo, P, vlen); - log2x_lo = __riscv_vfadd(log2x_lo, T_lo_flt, vlen); - log2x_lo = __riscv_vfmacc(log2x_lo, log2_inv_hi, poly, vlen); - VFLOAT log2x = __riscv_vfadd(log2x_hi, log2x_lo, vlen); - T_lo_flt = __riscv_vfsub(log2x_hi, log2x, vlen); - log2x_lo = __riscv_vfadd(T_lo_flt, log2x_lo, vlen); - // log2x + log2x_lo is log2(|x|) to extra precision - - // The final stage involves computing 2^(y * log2x) - VFLOAT vy_tmp = __riscv_vfmin(vy, 0x1.0p53, vlen); - vy_tmp = __riscv_vfmax(vy_tmp, -0x1.0p53, vlen); - VINT y_to_int = __riscv_vfcvt_x(vy_tmp, vlen); - VFLOAT vy_rnd_int = __riscv_vfcvt_f(y_to_int, vlen); - VBOOL y_is_int = __riscv_vmfeq(vy_tmp, vy_rnd_int, vlen); - y_to_int = __riscv_vsll(y_to_int, 63, vlen); - // if y is of integer value, y_to_int is the parity of y in the sign bit position - // To compute y * (log2x + log2x_lo) - // we first clip y to +-2^65 - vy = __riscv_vfmin(vy, two_to_65 , vlen); - vy = __riscv_vfmax(vy, negtwo_to_65, vlen); - vy_tmp = __riscv_vfmul(vy, log2x, vlen); - r = __riscv_vfmsub(vy, log2x, vy_tmp, vlen); - r = __riscv_vfmacc(r, vy, log2x_lo, vlen); - // vy_tmp + r is the product, clip at +-1100 - vy_tmp = __riscv_vfmin(vy_tmp, 0x1.13p10, vlen); - vy_tmp = __riscv_vfmax(vy_tmp, -0x1.13p10, vlen); - r = __riscv_vfmin(r, 0x1.0p-35, vlen); - r = __riscv_vfmax(r, -0x1.0p-35, vlen); - - // Argument reduction - VFLOAT n_flt = __riscv_vfmul(vy_tmp, 0x1.0p6, vlen); - n = __riscv_vfcvt_x(n_flt, vlen); - n_flt = __riscv_vfcvt_f(n, vlen); - - vy_tmp = __riscv_vfnmsac(vy_tmp, 0x1.0p-6, n_flt, vlen); - r = __riscv_vfadd(vy_tmp, r, vlen); - r = __riscv_vfmul(r, log2_hi, vlen); - - // Polynomial computation, we have a degree 5 - // We break this up into 2 pieces - // Ideally the compiler will interleave the computations of the segments - poly_right = PSTEP( 0x1.5555722e87735p-5, 0x1.1107f5fc29bb7p-7, r, vlen); - poly_left = PSTEP( 0x1.fffffffffe1f5p-2, 0x1.55555556582a8p-3, r, vlen); - - VFLOAT r_sq = __riscv_vfmul(r, r, vlen); - poly = __riscv_vfmadd(poly_right, r_sq, poly_left, vlen); - - poly = 
__riscv_vfmadd(poly, r_sq, r, vlen); - poly = __riscv_vfmul(poly, two_to_63, vlen); - VINT P_fixedpt = __riscv_vfcvt_x(poly, vlen); - - VINT j = __riscv_vand(n, 0x3f, vlen); - j = __riscv_vsll(j, 3, vlen); - VINT T = __riscv_vluxei64(expD_tbl64_fixedpt, I_AS_U(j), vlen); - - P_fixedpt = __riscv_vsmul(P_fixedpt, T, 1, vlen); - P_fixedpt = __riscv_vsadd(P_fixedpt, T, vlen); - vz = __riscv_vfcvt_f(P_fixedpt, vlen); - // at this point, vz ~=~ 2^62 * exp(r) - - n = __riscv_vsra(n, 6, vlen); - // Need to compute 2^(n-62) * exp(r). - // Although most of the time, it suffices to add n to the exponent field of exp(r) - // this will fail if n is just a bit too positive or negative, corresponding to - // 2^n * exp(r) causing over or underflow. - // So we have to decompose n into n1 + n2 where n1 = n >> 1 - // 2^n1 * exp(r) can be performed by adding n1 to exp(r)'s exponent field - // But we need to create the floating point value scale = 2^n2 - // and perform a multiplication to finish the task. - - n = __riscv_vsub(n, 62, vlen); - FAST_LDEXP(vz, n, vlen); - - VBOOL invalid = __riscv_vmsne(sign_x, 0, vlen); - invalid = __riscv_vmandn(invalid, y_is_int, vlen); - vz = __riscv_vfmerge(vz, fp_sNaN, invalid, vlen); - vz = __riscv_vfadd(vz, fp_posZero, vlen); - - sign_x = __riscv_vand(sign_x, y_to_int, vlen); - vz = __riscv_vfsgnj_mu(y_is_int, vz, vz, I_AS_F(sign_x), vlen); - - vz = __riscv_vmerge(vz, vz_special, special_args, vlen); - - VFSTORE_OUTARG1(vz, vlen); - - //VSE(z, vz, vlen); - //x += vlen; y += vlen, z+= vlen; - INCREMENT_INARG1(vlen); - INCREMENT_INARG2(vlen); - INCREMENT_OUTARG1(vlen); - } - RESTORE_FRM; + size_t vlen; + VFLOAT vx, vy, vz, vz_special; + VBOOL special_args; + + SET_ROUNDTONEAREST; + // stripmining over input arguments + for (; _inarg_n > 0; _inarg_n -= vlen) { + vlen = VSET(_inarg_n); + vx = VFLOAD_INARG1(vlen); + vy = VFLOAD_INARG2(vlen); + + // Set results when one of the inputs is NaN/Inf/Zero + EXCEPTION_HANDLING_POW(vx, vy, special_args, vz_special, vlen); + + // Normal computations. Here, both x and y are finite and non-zero. + // We compute 2^( y log_2(x) ) on the high level. But when x < 0, + // we must handle the cases when y is of integer value, making x^y well + // defined. So in essence, we try to compute 2^(y log_2(|x|)) and then + // figure out if one should replace this with NaN, or accept this numerical + // result with the possible flipping of its sign (x is negative and y is an + // odd integer). + + // Decompose in_arg into n, B, r + VINT n_adjust, sign_x; + VBOOL id_mask; + n_adjust = __riscv_vxor(n_adjust, n_adjust, vlen); + sign_x = __riscv_vxor(sign_x, sign_x, vlen); + sign_x = F_AS_I(__riscv_vfsgnj(I_AS_F(sign_x), vx, vlen)); + vx = __riscv_vfsgnjx(vx, vx, vlen); + IDENTIFY(vclass_x, class_Denorm, id_mask, vlen); + vx = __riscv_vfmul_mu(id_mask, vx, vx, 0x1.0p65, vlen); + n_adjust = __riscv_vmerge(n_adjust, 65, id_mask, vlen); + + VINT n = __riscv_vadd(F_AS_I(vx), round_up, vlen); + n = __riscv_vsub(n, bias, vlen); + n = __riscv_vsra(n, 52, vlen); + n = __riscv_vsub(n, n_adjust, vlen); + + VFLOAT A = __riscv_vfcvt_f(n, vlen); + + // To get frec7(X), it suffices to get frec7 of X with its exponent field + // set to bias. The main reason for this step is that should the exponent of + // X be the largest finite exponent, frec7(X) will be subnormal and carry + // less precision. 
Moreover, we need to get the 7 mantissa bits of X for table + // lookup later on. + VUINT ind = __riscv_vand(F_AS_U(vx), zero_mask_expo, vlen); + + // normalize exponent of vx + vx = U_AS_F(__riscv_vor(ind, bias, vlen)); + VFLOAT B = __riscv_vfrec7(vx, vlen); + ind = __riscv_vsrl(ind, 45, vlen); // 7 leading mantissa bits + ind = __riscv_vsll(ind, 4, vlen); // left shifted 4 (16-byte table) + + // adjust B to be 1.0 if ind == 0 + VBOOL adjust_B = __riscv_vmseq(ind, 0, vlen); + B = __riscv_vfmerge(B, fp_posOne, adjust_B, vlen); + VFLOAT r = VFMV_VF(fp_posOne, vlen); + r = __riscv_vfmsac(r, vx, B, vlen); + + // With A = n in float format, r, and ind we can carry out floating-point + // computations: (A + T) + log_e(1+r) * (1/log_e(2)). We compute log_e(1+r) + // by a polynomial approximation. To obtain an accurate pow(x,y) in the end, + // we must obtain at least 10 extra bits of precision over FP64. So + // log_e(1+r) is approximated by a degree-9 polynomial + // r - r^2/2 + r^3[ (p6 + r p5 + r^2 p4) + r^3 (p3 + r p2 + r^2 p1 + r^3 p0) ], + // that is, r - r^2/2 + poly. Furthermore, r - r^2/2 is computed as P + p; + // 1/log(2) is stored as log2_inv_hi, log2_inv_lo; and T is broken into + // T_hi, T_lo. So we need + // (A + T_hi) + log2_inv_hi * P + log2_inv_hi * poly + T_lo + log2_inv_lo*P. + // Note that log_2(|x|) needs to be represented in 2 FP64 variables as + // we need to have log_2(|x|) in extra precision. + // + VFLOAT rcube = __riscv_vfmul(r, r, vlen); + rcube = __riscv_vfmul(rcube, r, vlen); + + VFLOAT poly_right = + PSTEP(-0x1.555555483d731p-3, r, + PSTEP(0x1.2492453584b8ep-3, r, + PSTEP(-0x1.0005fa6ef2342p-3, 0x1.c7fe32d120e6bp-4, r, vlen), + vlen), + vlen); + + VFLOAT poly_left = PSTEP( + 0x1.5555555555555p-2, r, + PSTEP(-0x1.000000000003cp-2, 0x1.99999999a520ep-3, r, vlen), vlen); + + VFLOAT poly = __riscv_vfmadd(poly_right, rcube, poly_left, vlen); + // poly is (p6 + r p5 + r^2 p4 ) + r^3 (p3 + r p2 + r^2 p1 + r^3 p0) + + VFLOAT r_prime = __riscv_vfmul(r, -0x1.0p-1, vlen); // exact product + VFLOAT P = __riscv_vfmadd(r_prime, r, r, vlen); + VFLOAT p = __riscv_vfsub(r, P, vlen); + p = __riscv_vfmacc(p, r_prime, r, vlen); + // P + p is r - r^2/2 to extra precision + poly = __riscv_vfmadd(poly, rcube, p, vlen); + // Now P + poly is log_e(1+r) to extra precision + + // Load table values and get n_flt + T to be A + a + VFLOAT T_hi_flt = __riscv_vluxei64(logtbl_4_powD_128_hi_lo, ind, vlen); + ind = __riscv_vadd(ind, 8, vlen); + VFLOAT T_lo_flt = __riscv_vluxei64(logtbl_4_powD_128_hi_lo, ind, vlen); + + A = __riscv_vfadd(A, T_hi_flt, vlen); + // (A + T_hi) + log2_inv_hi * P + log2_inv_hi * poly + log2_inv_lo*P + T_lo + // is log2(|x|) to extra precision + VFLOAT log2x_hi = __riscv_vfmadd(P, log2_inv_hi, A, vlen); + VFLOAT log2x_lo = __riscv_vfsub(A, log2x_hi, vlen); + log2x_lo = __riscv_vfmacc(log2x_lo, log2_inv_hi, P, vlen); + + T_lo_flt = __riscv_vfmacc(T_lo_flt, log2_inv_lo, P, vlen); + log2x_lo = __riscv_vfadd(log2x_lo, T_lo_flt, vlen); + log2x_lo = __riscv_vfmacc(log2x_lo, log2_inv_hi, poly, vlen); + VFLOAT log2x = __riscv_vfadd(log2x_hi, log2x_lo, vlen); + T_lo_flt = __riscv_vfsub(log2x_hi, log2x, vlen); + log2x_lo = __riscv_vfadd(T_lo_flt, log2x_lo, vlen); + // log2x + log2x_lo is log2(|x|) to extra precision + + // The final stage involves computing 2^(y * log2x) + VFLOAT vy_tmp = __riscv_vfmin(vy, 0x1.0p53, vlen); + vy_tmp = __riscv_vfmax(vy_tmp, -0x1.0p53, vlen); + VINT y_to_int = __riscv_vfcvt_x(vy_tmp, vlen); + VFLOAT vy_rnd_int = __riscv_vfcvt_f(y_to_int, vlen); + VBOOL y_is_int = 
__riscv_vmfeq(vy_tmp, vy_rnd_int, vlen); + y_to_int = __riscv_vsll(y_to_int, 63, vlen); + // if y is of integer value, y_to_int is the parity of y in the sign bit + // position. To compute y * (log2x + log2x_lo), we first clip y to +-2^65. + vy = __riscv_vfmin(vy, two_to_65, vlen); + vy = __riscv_vfmax(vy, negtwo_to_65, vlen); + vy_tmp = __riscv_vfmul(vy, log2x, vlen); + r = __riscv_vfmsub(vy, log2x, vy_tmp, vlen); + r = __riscv_vfmacc(r, vy, log2x_lo, vlen); + // vy_tmp + r is the product, clip at +-1100 + vy_tmp = __riscv_vfmin(vy_tmp, 0x1.13p10, vlen); + vy_tmp = __riscv_vfmax(vy_tmp, -0x1.13p10, vlen); + r = __riscv_vfmin(r, 0x1.0p-35, vlen); + r = __riscv_vfmax(r, -0x1.0p-35, vlen); + + // Argument reduction + VFLOAT n_flt = __riscv_vfmul(vy_tmp, 0x1.0p6, vlen); + n = __riscv_vfcvt_x(n_flt, vlen); + n_flt = __riscv_vfcvt_f(n, vlen); + + vy_tmp = __riscv_vfnmsac(vy_tmp, 0x1.0p-6, n_flt, vlen); + r = __riscv_vfadd(vy_tmp, r, vlen); + r = __riscv_vfmul(r, log2_hi, vlen); + + // Polynomial computation: we have a degree-5 polynomial. + // We break this up into 2 pieces; ideally the compiler will interleave + // the computations of the segments. + poly_right = PSTEP(0x1.5555722e87735p-5, 0x1.1107f5fc29bb7p-7, r, vlen); + poly_left = PSTEP(0x1.fffffffffe1f5p-2, 0x1.55555556582a8p-3, r, vlen); + + VFLOAT r_sq = __riscv_vfmul(r, r, vlen); + poly = __riscv_vfmadd(poly_right, r_sq, poly_left, vlen); + + poly = __riscv_vfmadd(poly, r_sq, r, vlen); + poly = __riscv_vfmul(poly, two_to_63, vlen); + VINT P_fixedpt = __riscv_vfcvt_x(poly, vlen); + + VINT j = __riscv_vand(n, 0x3f, vlen); + j = __riscv_vsll(j, 3, vlen); + VINT T = __riscv_vluxei64(expD_tbl64_fixedpt, I_AS_U(j), vlen); + + P_fixedpt = __riscv_vsmul(P_fixedpt, T, 1, vlen); + P_fixedpt = __riscv_vsadd(P_fixedpt, T, vlen); + vz = __riscv_vfcvt_f(P_fixedpt, vlen); + // at this point, vz ~=~ 2^62 * exp(r) + + n = __riscv_vsra(n, 6, vlen); + // Need to compute 2^(n-62) * exp(r). + // Although most of the time it suffices to add n to the exponent field of + // exp(r), this will fail if n is just a bit too positive or negative, + // corresponding to 2^n * exp(r) causing over or underflow. So we have to + // decompose n into n1 + n2 where n1 = n >> 1. 2^n1 * exp(r) can be + // performed by adding n1 to exp(r)'s exponent field, but we need to create + // the floating point value scale = 2^n2 and perform a multiplication to + // finish the task. 
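Aside: a minimal scalar sketch of the exponent-splitting trick described in the comment above, in plain C. This is only an illustration of the technique; FAST_LDEXP is defined elsewhere in the library and its actual implementation may differ. It assumes |n| stays within the range this kernel produces (roughly |n| <= 1150), so that each half 2^n1 and 2^n2 is a normal double:

#include <stdint.h>

/* Scale z by 2^n in two steps so that neither partial scale factor */
/* nor the intermediate product over/underflows prematurely.        */
static double scale_by_2_pow_n(double z, int n) {
  int n1 = n / 2;                     /* first half of the shift */
  int n2 = n - n1;                    /* second half             */
  union { uint64_t u; double f; } s1, s2;
  s1.u = (uint64_t)(1023 + n1) << 52; /* s1.f == 2^n1, normal    */
  s2.u = (uint64_t)(1023 + n2) << 52; /* s2.f == 2^n2, normal    */
  return (z * s1.f) * s2.f;
}

Multiplying by a power of two is exact whenever the result stays normal, so only the final multiply can round, and it rounds correctly into the subnormal range or to infinity as appropriate.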
+ + n = __riscv_vsub(n, 62, vlen); + FAST_LDEXP(vz, n, vlen); + + VBOOL invalid = __riscv_vmsne(sign_x, 0, vlen); + invalid = __riscv_vmandn(invalid, y_is_int, vlen); + vz = __riscv_vfmerge(vz, fp_sNaN, invalid, vlen); + vz = __riscv_vfadd(vz, fp_posZero, vlen); + + sign_x = __riscv_vand(sign_x, y_to_int, vlen); + vz = __riscv_vfsgnj_mu(y_is_int, vz, vz, I_AS_F(sign_x), vlen); + + vz = __riscv_vmerge(vz, vz_special, special_args, vlen); + + VFSTORE_OUTARG1(vz, vlen); + + // VSE(z, vz, vlen); + // x += vlen; y += vlen, z+= vlen; + INCREMENT_INARG1(vlen); + INCREMENT_INARG2(vlen); + INCREMENT_OUTARG1(vlen); + } + RESTORE_FRM; } - diff --git a/src/rvvlm_exp10D.c b/src/rvvlm_exp10D.c index 011e8d9..9b53926 100644 --- a/src/rvvlm_exp10D.c +++ b/src/rvvlm_exp10D.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_EXP10 #include "rvvlm_expD.inc.h" - diff --git a/src/rvvlm_exp10DI.c b/src/rvvlm_exp10DI.c index 769ee96..0ff1563 100644 --- a/src/rvvlm_exp10DI.c +++ b/src/rvvlm_exp10DI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_EXP10 #include "rvvlm_expD.inc.h" - diff --git a/src/rvvlm_exp2D.c b/src/rvvlm_exp2D.c index cebec6c..e2c1d44 100644 --- a/src/rvvlm_exp2D.c +++ b/src/rvvlm_exp2D.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_EXP2 #include "rvvlm_expD.inc.h" - diff --git a/src/rvvlm_exp2DI.c b/src/rvvlm_exp2DI.c index 185d171..d1928e9 100644 --- a/src/rvvlm_exp2DI.c +++ b/src/rvvlm_exp2DI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_EXP2 #include "rvvlm_expD.inc.h" - diff --git a/src/rvvlm_expD.c b/src/rvvlm_expD.c index 857ebb5..d16ce68 100644 --- a/src/rvvlm_expD.c +++ b/src/rvvlm_expD.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_EXP #include "rvvlm_expD.inc.h" - diff --git a/src/rvvlm_expDI.c b/src/rvvlm_expDI.c index d4904c4..d96f4b4 100644 --- a/src/rvvlm_expDI.c +++ b/src/rvvlm_expDI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_EXP #include "rvvlm_expD.inc.h" - diff --git a/src/rvvlm_expm1D.c b/src/rvvlm_expm1D.c index 5384422..63ac259 100644 --- a/src/rvvlm_expm1D.c +++ b/src/rvvlm_expm1D.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -12,4 +12,3 @@ #include RVVLM_EXPM1D_VSET_CONFIG #include "rvvlm_expm1D.inc.h" - diff --git a/src/rvvlm_expm1DI.c b/src/rvvlm_expm1DI.c index 86ee475..1b9d6ef 100644 --- a/src/rvvlm_expm1DI.c +++ b/src/rvvlm_expm1DI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -12,4 +12,3 @@ #include RVVLM_EXPM1DI_VSET_CONFIG #include "rvvlm_expm1D.inc.h" - diff --git a/src/rvvlm_log10D.c b/src/rvvlm_log10D.c index f36254b..63bd885 100644 --- 
a/src/rvvlm_log10D.c +++ b/src/rvvlm_log10D.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_LOG10 #include "rvvlm_logD.inc.h" - diff --git a/src/rvvlm_log10DI.c b/src/rvvlm_log10DI.c index c1d6168..7137671 100644 --- a/src/rvvlm_log10DI.c +++ b/src/rvvlm_log10DI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_LOG10 #include "rvvlm_logD.inc.h" - diff --git a/src/rvvlm_log1pD.c b/src/rvvlm_log1pD.c index 062a87d..fb5d683 100644 --- a/src/rvvlm_log1pD.c +++ b/src/rvvlm_log1pD.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -12,4 +12,3 @@ #include RVVLM_LOG1PD_VSET_CONFIG #include "rvvlm_log1pD.inc.h" - diff --git a/src/rvvlm_log1pDI.c b/src/rvvlm_log1pDI.c index dc6bab0..9681899 100644 --- a/src/rvvlm_log1pDI.c +++ b/src/rvvlm_log1pDI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -12,4 +12,3 @@ #include RVVLM_LOG1PDI_VSET_CONFIG #include "rvvlm_log1pD.inc.h" - diff --git a/src/rvvlm_log2D.c b/src/rvvlm_log2D.c index 86815f0..2e4f1b5 100644 --- a/src/rvvlm_log2D.c +++ b/src/rvvlm_log2D.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_LOG2 #include "rvvlm_logD.inc.h" - diff --git a/src/rvvlm_log2DI.c b/src/rvvlm_log2DI.c index 9f76742..853b172 100644 --- a/src/rvvlm_log2DI.c +++ b/src/rvvlm_log2DI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_LOG2 #include "rvvlm_logD.inc.h" - diff --git a/src/rvvlm_logD.c b/src/rvvlm_logD.c index 7c7e0f5..6a910da 100644 --- a/src/rvvlm_logD.c +++ b/src/rvvlm_logD.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_LOG #include "rvvlm_logD.inc.h" - diff --git a/src/rvvlm_logDI.c b/src/rvvlm_logDI.c index 99c24e4..2d7d721 100644 --- a/src/rvvlm_logDI.c +++ b/src/rvvlm_logDI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_11 @@ -13,4 +13,3 @@ #define COMPILE_FOR_LOG #include "rvvlm_logD.inc.h" - diff --git a/src/rvvlm_logD_tbl.c b/src/rvvlm_logD_tbl.c index fdb3014..4d27d1d 100644 --- a/src/rvvlm_logD_tbl.c +++ b/src/rvvlm_logD_tbl.c @@ -5,38 +5,131 @@ // This table is used by different functions of the log and exponential family #include -const extern int64_t logD_tbl128_fixedpt[128] = { - 0x0, 0x22d443c414148a1, 0x3a475f892273f13, 0x51ea81cd5dc13cb, - 0x69be70ddf74c6a8, 0x81c3f7de5434ed0, 0x8dd9953002a4e86, 0xa62b07f3457c407, - 0xbeb024b67dda634, 0xd769c8d5b33a728, 0xe3da945b878e27d, 0xfce4aee0e88b275, - 0x1162593186da6fc4, 0x122dadc2ab3496d3, 0x13c6fb650cde50a1, 0x1563dc29ffacb20d, - 0x1633a8bf437ce10b, 0x17d60496cfbb4c67, 0x18a8980abfbd3266, 0x1a5094b54d282840, - 0x1b2602497d53458d, 0x1cd3c712d3110932, 0x1dac22d3e441d2fe, 0x1f5fd8a9063e3491, - 0x203b3779f4c3a8bb, 0x21f509008966a211, 
0x22d380a6c7e2b0e4, 0x23b30593aa4e106c, - 0x2575418697c3d7e0, 0x2657fdc6e1dcd0cb, 0x273bd1c2ab3edefe, 0x2906cbcd2baf2d54, - 0x29edf7659d8b30f2, 0x2ad645cd6af1c939, 0x2bbfb9e3dd5c1c88, 0x2d961ed0cb91d407, - 0x2e83159d77e31d6d, 0x2f713e059e555a64, 0x30609b21823fa654, 0x315130157f7a64cd, - 0x33360e552d8d64de, 0x342a5e28530367af, 0x351ff2e30214bc30, 0x3616cfe9e8d01fea, - 0x370ef8af6360dfe0, 0x380870b3c5fb66f7, 0x39033b85a8bfc871, 0x39ff5cc235a256c5, - 0x3afcd815786af188, 0x3bfbb13ab0dc5614, 0x3cfbebfca715669e, 0x3dfd8c36023f0ab7, - 0x3f0095d1a19a0332, 0x40050ccaf800ca8c, 0x410af52e69f26264, 0x42125319ae3bbf06, - 0x431b2abc31565be7, 0x442580577b936763, 0x4531583f9a2be204, 0x463eb6db8b4f066d, - 0x474da0a5ad495303, 0x485e1a2c30df9ea9, 0x497028118efabeb8, 0x4a83cf0d01c16e3d, - -0x3466ec14fec0a13b, -0x335004723c465e69, -0x323775123e2e1169, -0x323775123e2e1169, - -0x311d38e5c1644b49, -0x30014ac62c38a865, -0x2ee3a574fdf677c9, -0x2dc4439b3a19bcaf, - -0x2ca31fc8cef74dca, -0x2ca31fc8cef74dca, -0x2b803473f7ad0f3f, -0x2a5b7bf8992d66fc, - -0x2934f0979a3715fd, -0x280c8c76360892eb, -0x280c8c76360892eb, -0x26e2499d499bd9b3, - -0x25b621f89b355ede, -0x24880f561c0e7305, -0x24880f561c0e7305, -0x23580b6523e0e0a5, - -0x22260fb5a616eb96, -0x20f215b7606012de, -0x20f215b7606012de, -0x1fbc16b902680a24, - -0x1e840be74e6a4cc8, -0x1e840be74e6a4cc8, -0x1d49ee4c32596fc9, -0x1c0db6cdd94dee41, - -0x1c0db6cdd94dee41, -0x1acf5e2db4ec93f0, -0x198edd077e70df03, -0x198edd077e70df03, - -0x184c2bd02f03b2fe, -0x170742d4ef027f2a, -0x170742d4ef027f2a, -0x15c01a39fbd687a0, - -0x1476a9f983f74d31, -0x1476a9f983f74d31, -0x132ae9e278ae1a1f, -0x132ae9e278ae1a1f, - -0x11dcd197552b7b5f, -0x108c588cda79e396, -0x108c588cda79e396, -0xf397608bfd2d90e, - -0xf397608bfd2d90e, -0xde4212056d5dd32, -0xc8c50b72319ad57, -0xc8c50b72319ad57, - -0xb31fb7d64898b3e, -0xb31fb7d64898b3e, -0x9d517ee93f8e16c, -0x9d517ee93f8e16c, - -0x8759c4fd14fcd5a, -0x7137eae42aad7bd, -0x7137eae42aad7bd, -0x5aeb4dd63bf61cc, - -0x5aeb4dd63bf61cc, -0x447347544cd04bb, -0x447347544cd04bb, -0x2dcf2d0b85a4531, - -0x2dcf2d0b85a4531, -0x16fe50b6ef08518, -0x16fe50b6ef08518, 0x0 -}; - +const extern int64_t logD_tbl128_fixedpt[128] = {0x0, + 0x22d443c414148a1, + 0x3a475f892273f13, + 0x51ea81cd5dc13cb, + 0x69be70ddf74c6a8, + 0x81c3f7de5434ed0, + 0x8dd9953002a4e86, + 0xa62b07f3457c407, + 0xbeb024b67dda634, + 0xd769c8d5b33a728, + 0xe3da945b878e27d, + 0xfce4aee0e88b275, + 0x1162593186da6fc4, + 0x122dadc2ab3496d3, + 0x13c6fb650cde50a1, + 0x1563dc29ffacb20d, + 0x1633a8bf437ce10b, + 0x17d60496cfbb4c67, + 0x18a8980abfbd3266, + 0x1a5094b54d282840, + 0x1b2602497d53458d, + 0x1cd3c712d3110932, + 0x1dac22d3e441d2fe, + 0x1f5fd8a9063e3491, + 0x203b3779f4c3a8bb, + 0x21f509008966a211, + 0x22d380a6c7e2b0e4, + 0x23b30593aa4e106c, + 0x2575418697c3d7e0, + 0x2657fdc6e1dcd0cb, + 0x273bd1c2ab3edefe, + 0x2906cbcd2baf2d54, + 0x29edf7659d8b30f2, + 0x2ad645cd6af1c939, + 0x2bbfb9e3dd5c1c88, + 0x2d961ed0cb91d407, + 0x2e83159d77e31d6d, + 0x2f713e059e555a64, + 0x30609b21823fa654, + 0x315130157f7a64cd, + 0x33360e552d8d64de, + 0x342a5e28530367af, + 0x351ff2e30214bc30, + 0x3616cfe9e8d01fea, + 0x370ef8af6360dfe0, + 0x380870b3c5fb66f7, + 0x39033b85a8bfc871, + 0x39ff5cc235a256c5, + 0x3afcd815786af188, + 0x3bfbb13ab0dc5614, + 0x3cfbebfca715669e, + 0x3dfd8c36023f0ab7, + 0x3f0095d1a19a0332, + 0x40050ccaf800ca8c, + 0x410af52e69f26264, + 0x42125319ae3bbf06, + 0x431b2abc31565be7, + 0x442580577b936763, + 0x4531583f9a2be204, + 0x463eb6db8b4f066d, + 0x474da0a5ad495303, + 0x485e1a2c30df9ea9, + 0x497028118efabeb8, + 0x4a83cf0d01c16e3d, + 
-0x3466ec14fec0a13b, + -0x335004723c465e69, + -0x323775123e2e1169, + -0x323775123e2e1169, + -0x311d38e5c1644b49, + -0x30014ac62c38a865, + -0x2ee3a574fdf677c9, + -0x2dc4439b3a19bcaf, + -0x2ca31fc8cef74dca, + -0x2ca31fc8cef74dca, + -0x2b803473f7ad0f3f, + -0x2a5b7bf8992d66fc, + -0x2934f0979a3715fd, + -0x280c8c76360892eb, + -0x280c8c76360892eb, + -0x26e2499d499bd9b3, + -0x25b621f89b355ede, + -0x24880f561c0e7305, + -0x24880f561c0e7305, + -0x23580b6523e0e0a5, + -0x22260fb5a616eb96, + -0x20f215b7606012de, + -0x20f215b7606012de, + -0x1fbc16b902680a24, + -0x1e840be74e6a4cc8, + -0x1e840be74e6a4cc8, + -0x1d49ee4c32596fc9, + -0x1c0db6cdd94dee41, + -0x1c0db6cdd94dee41, + -0x1acf5e2db4ec93f0, + -0x198edd077e70df03, + -0x198edd077e70df03, + -0x184c2bd02f03b2fe, + -0x170742d4ef027f2a, + -0x170742d4ef027f2a, + -0x15c01a39fbd687a0, + -0x1476a9f983f74d31, + -0x1476a9f983f74d31, + -0x132ae9e278ae1a1f, + -0x132ae9e278ae1a1f, + -0x11dcd197552b7b5f, + -0x108c588cda79e396, + -0x108c588cda79e396, + -0xf397608bfd2d90e, + -0xf397608bfd2d90e, + -0xde4212056d5dd32, + -0xc8c50b72319ad57, + -0xc8c50b72319ad57, + -0xb31fb7d64898b3e, + -0xb31fb7d64898b3e, + -0x9d517ee93f8e16c, + -0x9d517ee93f8e16c, + -0x8759c4fd14fcd5a, + -0x7137eae42aad7bd, + -0x7137eae42aad7bd, + -0x5aeb4dd63bf61cc, + -0x5aeb4dd63bf61cc, + -0x447347544cd04bb, + -0x447347544cd04bb, + -0x2dcf2d0b85a4531, + -0x2dcf2d0b85a4531, + -0x16fe50b6ef08518, + -0x16fe50b6ef08518, + 0x0}; diff --git a/src/rvvlm_powD.c b/src/rvvlm_powD.c index 9243fdb..5e11623 100644 --- a/src/rvvlm_powD.c +++ b/src/rvvlm_powD.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_21 @@ -12,4 +12,3 @@ #include RVVLM_POWD_VSET_CONFIG #include "rvvlm_powD.inc.h" - diff --git a/src/rvvlm_powDI.c b/src/rvvlm_powDI.c index 9804a86..33a6180 100644 --- a/src/rvvlm_powDI.c +++ b/src/rvvlm_powDI.c @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include "rvvlm.h" #define API_SIGNATURE API_SIGNATURE_21 @@ -12,4 +12,3 @@ #include RVVLM_POWDI_VSET_CONFIG #include "rvvlm_powD.inc.h" - diff --git a/src/rvvlm_powD_tbl.c b/src/rvvlm_powD_tbl.c index 4ae12cc..f12925c 100644 --- a/src/rvvlm_powD_tbl.c +++ b/src/rvvlm_powD_tbl.c @@ -5,113 +5,388 @@ // This table is used by different functions of the log and exponential family #include -const extern double logtbl_4_powD_128_hi_lo[256] = { - 0x0.0p+0, 0x0.0p+0, 0x1.16a21e20c0000p-6, -0x1.f5baf436cbec7p-42, - 0x1.d23afc4900000p-6, 0x1.39f89bcdae7bdp-42, 0x1.47aa073580000p-5, -0x1.1f61a96b8ce77p-42, - 0x1.a6f9c377e0000p-5, -0x1.672b0c88d4dd6p-44, 0x1.0387efbcb0000p-4, -0x1.e5897de9078d1p-42, - 0x1.1bb32a6000000p-4, 0x1.52743318a8a57p-42, 0x1.4c560fe690000p-4, -0x1.41dfc7d7c3321p-42, - 0x1.7d60496d00000p-4, -0x1.12ce6312ebb82p-42, 0x1.aed391ab60000p-4, 0x1.9d3940238de7fp-42, - 0x1.c7b528b710000p-4, -0x1.c760bc9b188c4p-45, 0x1.f9c95dc1d0000p-4, 0x1.164e932b2d51cp-44, - 0x1.1625931870000p-3, -0x1.2c81df0fdbc29p-42, 0x1.22dadc2ab0000p-3, 0x1.a4b69691d7994p-42, - 0x1.3c6fb650d0000p-3, -0x1.0d7af4dda9c36p-42, 0x1.563dc29ff8000p-3, 0x1.6590643906f2ap-42, - 0x1.633a8bf438000p-3, -0x1.8f7aac147fdc1p-46, 0x1.7d60496cf8000p-3, 0x1.da6339da288fcp-42, - 0x1.8a8980abf8000p-3, 0x1.e9933354dbf17p-42, 0x1.a5094b54d0000p-3, 0x1.41420276dd59dp-42, - 0x1.b2602497d8000p-3, -0x1.65d3990cb67bap-42, 0x1.cd3c712d30000p-3, 0x1.109325dd5e814p-43, - 0x1.dac22d3e48000p-3, -0x1.f1680dd458fb2p-42, 0x1.f5fd8a9060000p-3, 0x1.f1a4847f7b278p-42, - 
0x1.01d9bbcfa8000p-2, -0x1.e2ba25d9aeffdp-42, 0x1.0fa848044c000p-2, -0x1.95def21f8497bp-43, - 0x1.169c053640000p-2, -0x1.d4f1b95e0ff45p-43, 0x1.1d982c9d54000p-2, -0x1.8f7ca2cff7b90p-42, - 0x1.2baa0c34c0000p-2, -0x1.e1410132ae5e4p-42, 0x1.32bfee3710000p-2, -0x1.1979a5db68722p-42, - 0x1.39de8e1558000p-2, 0x1.f6f7f2b4bd1c4p-42, 0x1.48365e695c000p-2, 0x1.796aa2981fdbcp-42, - 0x1.4f6fbb2cec000p-2, 0x1.661e393a16b95p-44, 0x1.56b22e6b58000p-2, -0x1.c6d8d86531d56p-44, - 0x1.5dfdcf1eec000p-2, -0x1.1f1bbd2926f16p-42, 0x1.6cb0f6865c000p-2, 0x1.1d406db502403p-43, - 0x1.7418acebc0000p-2, -0x1.ce2935fff809ap-43, 0x1.7b89f02cf4000p-2, -0x1.552ce0ec3a295p-42, - 0x1.8304d90c10000p-2, 0x1.fd32a3ab0a4b5p-42, 0x1.8a8980abfc000p-2, -0x1.66cccab240e90p-45, - 0x1.99b072a96c000p-2, 0x1.ac9bca36fd02ep-44, 0x1.a152f14298000p-2, 0x1.b3d7b0e65d2cep-46, - 0x1.a8ff971810000p-2, 0x1.4bc302ffa76fbp-43, 0x1.b0b67f4f48000p-2, -0x1.7f00af09dc1c7p-42, - 0x1.b877c57b1c000p-2, -0x1.f20203b3186a6p-43, 0x1.c043859e30000p-2, -0x1.2642415d47384p-45, - 0x1.c819dc2d44000p-2, 0x1.fe43895d8ac46p-42, 0x1.cffae611ac000p-2, 0x1.12b628e2d05d7p-42, - 0x1.d7e6c0abc4000p-2, -0x1.50e785694a8c6p-43, 0x1.dfdd89d588000p-2, -0x1.1d4f639bb5cdfp-42, - 0x1.e7df5fe538000p-2, 0x1.5669df6a2b592p-43, 0x1.efec61b010000p-2, 0x1.f855b4987c5d5p-42, - 0x1.f804ae8d0c000p-2, 0x1.a0331af2e6feap-43, 0x1.0014332be0000p-1, 0x1.9518ce032f41dp-48, - 0x1.042bd4b9a8000p-1, -0x1.b3b3864c60011p-44, 0x1.08494c66b8000p-1, 0x1.ddf82e1fe57c7p-42, - 0x1.0c6caaf0c6000p-1, -0x1.4d20c519e12f4p-42, 0x1.1096015dee000p-1, 0x1.3676289cd3dd4p-43, - 0x1.14c560fe68000p-1, 0x1.5f101c141e670p-42, 0x1.18fadb6e2e000p-1, -0x1.87cc95d0a2ee8p-42, - 0x1.1d368296b6000p-1, -0x1.b567e7ee54aefp-42, 0x1.217868b0c4000p-1, -0x1.030ab442ce320p-42, - 0x1.25c0a0463c000p-1, -0x1.50520a377c7ecp-45, 0x1.2a0f3c3408000p-1, -0x1.f48e1a4725559p-42, - -0x1.a33760a7f8000p-2, 0x1.faf6283bf2868p-42, -0x1.9a802391e4000p-2, 0x1.cd0cb4492f1bcp-42, - -0x1.91bba891f0000p-2, -0x1.708b4b2b5056cp-42, -0x1.91bba891f0000p-2, -0x1.708b4b2b5056cp-42, - -0x1.88e9c72e0c000p-2, 0x1.bb4b69336b66ep-43, -0x1.800a563160000p-2, -0x1.c5432aeb609f5p-42, - -0x1.771d2ba7f0000p-2, 0x1.3106e404cabb7p-44, -0x1.6e221cd9d0000p-2, -0x1.9bcaf1aa4168ap-43, - -0x1.6518fe4678000p-2, 0x1.1646b761c48dep-44, -0x1.6518fe4678000p-2, 0x1.1646b761c48dep-44, - -0x1.5c01a39fbc000p-2, -0x1.6879fa00b120ap-42, -0x1.52dbdfc4c8000p-2, -0x1.6b37dcf60e620p-42, - -0x1.49a784bcd0000p-2, -0x1.b8afe492bf6ffp-42, -0x1.406463b1b0000p-2, -0x1.125d6cbcd1095p-44, - -0x1.406463b1b0000p-2, -0x1.125d6cbcd1095p-44, -0x1.37124cea4c000p-2, -0x1.bd9b32266d92cp-43, - -0x1.2db10fc4d8000p-2, -0x1.aaf6f137a3d8cp-42, -0x1.24407ab0e0000p-2, -0x1.ce60916e52e91p-44, - -0x1.24407ab0e0000p-2, -0x1.ce60916e52e91p-44, -0x1.1ac05b2920000p-2, 0x1.f1f5ae718f241p-43, - -0x1.11307dad30000p-2, -0x1.6eb9612e0b4f3p-43, -0x1.0790adbb04000p-2, 0x1.fed21f9cb2cc5p-43, - -0x1.0790adbb04000p-2, 0x1.fed21f9cb2cc5p-43, -0x1.fbc16b9028000p-3, 0x1.7f5dc57266758p-43, - -0x1.e840be74e8000p-3, 0x1.5b338360c2ae2p-43, -0x1.e840be74e8000p-3, 0x1.5b338360c2ae2p-43, - -0x1.d49ee4c328000p-3, 0x1.3481b85a54d7fp-42, -0x1.c0db6cdd98000p-3, 0x1.908df8ec933b3p-42, - -0x1.c0db6cdd98000p-3, 0x1.908df8ec933b3p-42, -0x1.acf5e2db50000p-3, 0x1.36c101ee13440p-43, - -0x1.98edd077e8000p-3, 0x1.e41fa0a62e6aep-44, -0x1.98edd077e8000p-3, 0x1.e41fa0a62e6aep-44, - -0x1.84c2bd02f0000p-3, -0x1.d97ee9124773bp-46, -0x1.70742d4ef0000p-3, -0x1.3f94e00e7d6bcp-46, - -0x1.70742d4ef0000p-3, -0x1.3f94e00e7d6bcp-46, -0x1.5c01a39fc0000p-3, 
0x1.4bc302ffa76fbp-42, - -0x1.476a9f9840000p-3, 0x1.1659d8e2d7d38p-44, -0x1.476a9f9840000p-3, 0x1.1659d8e2d7d38p-44, - -0x1.32ae9e2788000p-3, -0x1.70d0fa8f9603bp-42, -0x1.32ae9e2788000p-3, -0x1.70d0fa8f9603bp-42, - -0x1.1dcd197550000p-3, -0x1.5bdaf522a183cp-42, -0x1.08c588cda8000p-3, 0x1.871a7610e40bdp-45, - -0x1.08c588cda8000p-3, 0x1.871a7610e40bdp-45, -0x1.e72ec11800000p-4, 0x1.69378d0928989p-42, - -0x1.e72ec11800000p-4, 0x1.69378d0928989p-42, -0x1.bc84240ae0000p-4, 0x1.51167134e9647p-42, - -0x1.918a16e460000p-4, -0x1.9ad57391924a7p-43, -0x1.918a16e460000p-4, -0x1.9ad57391924a7p-43, - -0x1.663f6fac90000p-4, -0x1.3167ccc538261p-44, -0x1.663f6fac90000p-4, -0x1.3167ccc538261p-44, - -0x1.3aa2fdd280000p-4, 0x1.c7a4ff65ddbc9p-45, -0x1.3aa2fdd280000p-4, 0x1.c7a4ff65ddbc9p-45, - -0x1.0eb389fa30000p-4, 0x1.819530c22d152p-42, -0x1.c4dfab90a0000p-5, -0x1.56bde9f1f0d3dp-42, - -0x1.c4dfab90a0000p-5, -0x1.56bde9f1f0d3dp-42, -0x1.6bad3758e0000p-5, -0x1.fb0e626c0de13p-42, - -0x1.6bad3758e0000p-5, -0x1.fb0e626c0de13p-42, -0x1.11cd1d5140000p-5, 0x1.97da24fd75f61p-42, - -0x1.11cd1d5140000p-5, 0x1.97da24fd75f61p-42, -0x1.6e79685c40000p-6, 0x1.2dd67591d81dfp-42, - -0x1.6e79685c40000p-6, 0x1.2dd67591d81dfp-42, -0x1.6fe50b6f00000p-7, 0x1.ef5d00e390a00p-44, - -0x1.6fe50b6f00000p-7, 0x1.ef5d00e390a00p-44, 0x0.0p+0, 0x0.0p+0 -}; - - - - - - - - - -const extern int64_t logD_tbl128_fixedpt[128] = { - 0x0, 0x22d443c414148a1, 0x3a475f892273f13, 0x51ea81cd5dc13cb, - 0x69be70ddf74c6a8, 0x81c3f7de5434ed0, 0x8dd9953002a4e86, 0xa62b07f3457c407, - 0xbeb024b67dda634, 0xd769c8d5b33a728, 0xe3da945b878e27d, 0xfce4aee0e88b275, - 0x1162593186da6fc4, 0x122dadc2ab3496d3, 0x13c6fb650cde50a1, 0x1563dc29ffacb20d, - 0x1633a8bf437ce10b, 0x17d60496cfbb4c67, 0x18a8980abfbd3266, 0x1a5094b54d282840, - 0x1b2602497d53458d, 0x1cd3c712d3110932, 0x1dac22d3e441d2fe, 0x1f5fd8a9063e3491, - 0x203b3779f4c3a8bb, 0x21f509008966a211, 0x22d380a6c7e2b0e4, 0x23b30593aa4e106c, - 0x2575418697c3d7e0, 0x2657fdc6e1dcd0cb, 0x273bd1c2ab3edefe, 0x2906cbcd2baf2d54, - 0x29edf7659d8b30f2, 0x2ad645cd6af1c939, 0x2bbfb9e3dd5c1c88, 0x2d961ed0cb91d407, - 0x2e83159d77e31d6d, 0x2f713e059e555a64, 0x30609b21823fa654, 0x315130157f7a64cd, - 0x33360e552d8d64de, 0x342a5e28530367af, 0x351ff2e30214bc30, 0x3616cfe9e8d01fea, - 0x370ef8af6360dfe0, 0x380870b3c5fb66f7, 0x39033b85a8bfc871, 0x39ff5cc235a256c5, - 0x3afcd815786af188, 0x3bfbb13ab0dc5614, 0x3cfbebfca715669e, 0x3dfd8c36023f0ab7, - 0x3f0095d1a19a0332, 0x40050ccaf800ca8c, 0x410af52e69f26264, 0x42125319ae3bbf06, - 0x431b2abc31565be7, 0x442580577b936763, 0x4531583f9a2be204, 0x463eb6db8b4f066d, - 0x474da0a5ad495303, 0x485e1a2c30df9ea9, 0x497028118efabeb8, 0x4a83cf0d01c16e3d, - -0x3466ec14fec0a13b, -0x335004723c465e69, -0x323775123e2e1169, -0x323775123e2e1169, - -0x311d38e5c1644b49, -0x30014ac62c38a865, -0x2ee3a574fdf677c9, -0x2dc4439b3a19bcaf, - -0x2ca31fc8cef74dca, -0x2ca31fc8cef74dca, -0x2b803473f7ad0f3f, -0x2a5b7bf8992d66fc, - -0x2934f0979a3715fd, -0x280c8c76360892eb, -0x280c8c76360892eb, -0x26e2499d499bd9b3, - -0x25b621f89b355ede, -0x24880f561c0e7305, -0x24880f561c0e7305, -0x23580b6523e0e0a5, - -0x22260fb5a616eb96, -0x20f215b7606012de, -0x20f215b7606012de, -0x1fbc16b902680a24, - -0x1e840be74e6a4cc8, -0x1e840be74e6a4cc8, -0x1d49ee4c32596fc9, -0x1c0db6cdd94dee41, - -0x1c0db6cdd94dee41, -0x1acf5e2db4ec93f0, -0x198edd077e70df03, -0x198edd077e70df03, - -0x184c2bd02f03b2fe, -0x170742d4ef027f2a, -0x170742d4ef027f2a, -0x15c01a39fbd687a0, - -0x1476a9f983f74d31, -0x1476a9f983f74d31, -0x132ae9e278ae1a1f, -0x132ae9e278ae1a1f, - 
-0x11dcd197552b7b5f, -0x108c588cda79e396, -0x108c588cda79e396, -0xf397608bfd2d90e, - -0xf397608bfd2d90e, -0xde4212056d5dd32, -0xc8c50b72319ad57, -0xc8c50b72319ad57, - -0xb31fb7d64898b3e, -0xb31fb7d64898b3e, -0x9d517ee93f8e16c, -0x9d517ee93f8e16c, - -0x8759c4fd14fcd5a, -0x7137eae42aad7bd, -0x7137eae42aad7bd, -0x5aeb4dd63bf61cc, - -0x5aeb4dd63bf61cc, -0x447347544cd04bb, -0x447347544cd04bb, -0x2dcf2d0b85a4531, - -0x2dcf2d0b85a4531, -0x16fe50b6ef08518, -0x16fe50b6ef08518, 0x0 -}; +const extern double logtbl_4_powD_128_hi_lo[256] = {0x0.0p+0, + 0x0.0p+0, + 0x1.16a21e20c0000p-6, + -0x1.f5baf436cbec7p-42, + 0x1.d23afc4900000p-6, + 0x1.39f89bcdae7bdp-42, + 0x1.47aa073580000p-5, + -0x1.1f61a96b8ce77p-42, + 0x1.a6f9c377e0000p-5, + -0x1.672b0c88d4dd6p-44, + 0x1.0387efbcb0000p-4, + -0x1.e5897de9078d1p-42, + 0x1.1bb32a6000000p-4, + 0x1.52743318a8a57p-42, + 0x1.4c560fe690000p-4, + -0x1.41dfc7d7c3321p-42, + 0x1.7d60496d00000p-4, + -0x1.12ce6312ebb82p-42, + 0x1.aed391ab60000p-4, + 0x1.9d3940238de7fp-42, + 0x1.c7b528b710000p-4, + -0x1.c760bc9b188c4p-45, + 0x1.f9c95dc1d0000p-4, + 0x1.164e932b2d51cp-44, + 0x1.1625931870000p-3, + -0x1.2c81df0fdbc29p-42, + 0x1.22dadc2ab0000p-3, + 0x1.a4b69691d7994p-42, + 0x1.3c6fb650d0000p-3, + -0x1.0d7af4dda9c36p-42, + 0x1.563dc29ff8000p-3, + 0x1.6590643906f2ap-42, + 0x1.633a8bf438000p-3, + -0x1.8f7aac147fdc1p-46, + 0x1.7d60496cf8000p-3, + 0x1.da6339da288fcp-42, + 0x1.8a8980abf8000p-3, + 0x1.e9933354dbf17p-42, + 0x1.a5094b54d0000p-3, + 0x1.41420276dd59dp-42, + 0x1.b2602497d8000p-3, + -0x1.65d3990cb67bap-42, + 0x1.cd3c712d30000p-3, + 0x1.109325dd5e814p-43, + 0x1.dac22d3e48000p-3, + -0x1.f1680dd458fb2p-42, + 0x1.f5fd8a9060000p-3, + 0x1.f1a4847f7b278p-42, + 0x1.01d9bbcfa8000p-2, + -0x1.e2ba25d9aeffdp-42, + 0x1.0fa848044c000p-2, + -0x1.95def21f8497bp-43, + 0x1.169c053640000p-2, + -0x1.d4f1b95e0ff45p-43, + 0x1.1d982c9d54000p-2, + -0x1.8f7ca2cff7b90p-42, + 0x1.2baa0c34c0000p-2, + -0x1.e1410132ae5e4p-42, + 0x1.32bfee3710000p-2, + -0x1.1979a5db68722p-42, + 0x1.39de8e1558000p-2, + 0x1.f6f7f2b4bd1c4p-42, + 0x1.48365e695c000p-2, + 0x1.796aa2981fdbcp-42, + 0x1.4f6fbb2cec000p-2, + 0x1.661e393a16b95p-44, + 0x1.56b22e6b58000p-2, + -0x1.c6d8d86531d56p-44, + 0x1.5dfdcf1eec000p-2, + -0x1.1f1bbd2926f16p-42, + 0x1.6cb0f6865c000p-2, + 0x1.1d406db502403p-43, + 0x1.7418acebc0000p-2, + -0x1.ce2935fff809ap-43, + 0x1.7b89f02cf4000p-2, + -0x1.552ce0ec3a295p-42, + 0x1.8304d90c10000p-2, + 0x1.fd32a3ab0a4b5p-42, + 0x1.8a8980abfc000p-2, + -0x1.66cccab240e90p-45, + 0x1.99b072a96c000p-2, + 0x1.ac9bca36fd02ep-44, + 0x1.a152f14298000p-2, + 0x1.b3d7b0e65d2cep-46, + 0x1.a8ff971810000p-2, + 0x1.4bc302ffa76fbp-43, + 0x1.b0b67f4f48000p-2, + -0x1.7f00af09dc1c7p-42, + 0x1.b877c57b1c000p-2, + -0x1.f20203b3186a6p-43, + 0x1.c043859e30000p-2, + -0x1.2642415d47384p-45, + 0x1.c819dc2d44000p-2, + 0x1.fe43895d8ac46p-42, + 0x1.cffae611ac000p-2, + 0x1.12b628e2d05d7p-42, + 0x1.d7e6c0abc4000p-2, + -0x1.50e785694a8c6p-43, + 0x1.dfdd89d588000p-2, + -0x1.1d4f639bb5cdfp-42, + 0x1.e7df5fe538000p-2, + 0x1.5669df6a2b592p-43, + 0x1.efec61b010000p-2, + 0x1.f855b4987c5d5p-42, + 0x1.f804ae8d0c000p-2, + 0x1.a0331af2e6feap-43, + 0x1.0014332be0000p-1, + 0x1.9518ce032f41dp-48, + 0x1.042bd4b9a8000p-1, + -0x1.b3b3864c60011p-44, + 0x1.08494c66b8000p-1, + 0x1.ddf82e1fe57c7p-42, + 0x1.0c6caaf0c6000p-1, + -0x1.4d20c519e12f4p-42, + 0x1.1096015dee000p-1, + 0x1.3676289cd3dd4p-43, + 0x1.14c560fe68000p-1, + 0x1.5f101c141e670p-42, + 0x1.18fadb6e2e000p-1, + -0x1.87cc95d0a2ee8p-42, + 0x1.1d368296b6000p-1, + -0x1.b567e7ee54aefp-42, + 0x1.217868b0c4000p-1, + 
-0x1.030ab442ce320p-42, + 0x1.25c0a0463c000p-1, + -0x1.50520a377c7ecp-45, + 0x1.2a0f3c3408000p-1, + -0x1.f48e1a4725559p-42, + -0x1.a33760a7f8000p-2, + 0x1.faf6283bf2868p-42, + -0x1.9a802391e4000p-2, + 0x1.cd0cb4492f1bcp-42, + -0x1.91bba891f0000p-2, + -0x1.708b4b2b5056cp-42, + -0x1.91bba891f0000p-2, + -0x1.708b4b2b5056cp-42, + -0x1.88e9c72e0c000p-2, + 0x1.bb4b69336b66ep-43, + -0x1.800a563160000p-2, + -0x1.c5432aeb609f5p-42, + -0x1.771d2ba7f0000p-2, + 0x1.3106e404cabb7p-44, + -0x1.6e221cd9d0000p-2, + -0x1.9bcaf1aa4168ap-43, + -0x1.6518fe4678000p-2, + 0x1.1646b761c48dep-44, + -0x1.6518fe4678000p-2, + 0x1.1646b761c48dep-44, + -0x1.5c01a39fbc000p-2, + -0x1.6879fa00b120ap-42, + -0x1.52dbdfc4c8000p-2, + -0x1.6b37dcf60e620p-42, + -0x1.49a784bcd0000p-2, + -0x1.b8afe492bf6ffp-42, + -0x1.406463b1b0000p-2, + -0x1.125d6cbcd1095p-44, + -0x1.406463b1b0000p-2, + -0x1.125d6cbcd1095p-44, + -0x1.37124cea4c000p-2, + -0x1.bd9b32266d92cp-43, + -0x1.2db10fc4d8000p-2, + -0x1.aaf6f137a3d8cp-42, + -0x1.24407ab0e0000p-2, + -0x1.ce60916e52e91p-44, + -0x1.24407ab0e0000p-2, + -0x1.ce60916e52e91p-44, + -0x1.1ac05b2920000p-2, + 0x1.f1f5ae718f241p-43, + -0x1.11307dad30000p-2, + -0x1.6eb9612e0b4f3p-43, + -0x1.0790adbb04000p-2, + 0x1.fed21f9cb2cc5p-43, + -0x1.0790adbb04000p-2, + 0x1.fed21f9cb2cc5p-43, + -0x1.fbc16b9028000p-3, + 0x1.7f5dc57266758p-43, + -0x1.e840be74e8000p-3, + 0x1.5b338360c2ae2p-43, + -0x1.e840be74e8000p-3, + 0x1.5b338360c2ae2p-43, + -0x1.d49ee4c328000p-3, + 0x1.3481b85a54d7fp-42, + -0x1.c0db6cdd98000p-3, + 0x1.908df8ec933b3p-42, + -0x1.c0db6cdd98000p-3, + 0x1.908df8ec933b3p-42, + -0x1.acf5e2db50000p-3, + 0x1.36c101ee13440p-43, + -0x1.98edd077e8000p-3, + 0x1.e41fa0a62e6aep-44, + -0x1.98edd077e8000p-3, + 0x1.e41fa0a62e6aep-44, + -0x1.84c2bd02f0000p-3, + -0x1.d97ee9124773bp-46, + -0x1.70742d4ef0000p-3, + -0x1.3f94e00e7d6bcp-46, + -0x1.70742d4ef0000p-3, + -0x1.3f94e00e7d6bcp-46, + -0x1.5c01a39fc0000p-3, + 0x1.4bc302ffa76fbp-42, + -0x1.476a9f9840000p-3, + 0x1.1659d8e2d7d38p-44, + -0x1.476a9f9840000p-3, + 0x1.1659d8e2d7d38p-44, + -0x1.32ae9e2788000p-3, + -0x1.70d0fa8f9603bp-42, + -0x1.32ae9e2788000p-3, + -0x1.70d0fa8f9603bp-42, + -0x1.1dcd197550000p-3, + -0x1.5bdaf522a183cp-42, + -0x1.08c588cda8000p-3, + 0x1.871a7610e40bdp-45, + -0x1.08c588cda8000p-3, + 0x1.871a7610e40bdp-45, + -0x1.e72ec11800000p-4, + 0x1.69378d0928989p-42, + -0x1.e72ec11800000p-4, + 0x1.69378d0928989p-42, + -0x1.bc84240ae0000p-4, + 0x1.51167134e9647p-42, + -0x1.918a16e460000p-4, + -0x1.9ad57391924a7p-43, + -0x1.918a16e460000p-4, + -0x1.9ad57391924a7p-43, + -0x1.663f6fac90000p-4, + -0x1.3167ccc538261p-44, + -0x1.663f6fac90000p-4, + -0x1.3167ccc538261p-44, + -0x1.3aa2fdd280000p-4, + 0x1.c7a4ff65ddbc9p-45, + -0x1.3aa2fdd280000p-4, + 0x1.c7a4ff65ddbc9p-45, + -0x1.0eb389fa30000p-4, + 0x1.819530c22d152p-42, + -0x1.c4dfab90a0000p-5, + -0x1.56bde9f1f0d3dp-42, + -0x1.c4dfab90a0000p-5, + -0x1.56bde9f1f0d3dp-42, + -0x1.6bad3758e0000p-5, + -0x1.fb0e626c0de13p-42, + -0x1.6bad3758e0000p-5, + -0x1.fb0e626c0de13p-42, + -0x1.11cd1d5140000p-5, + 0x1.97da24fd75f61p-42, + -0x1.11cd1d5140000p-5, + 0x1.97da24fd75f61p-42, + -0x1.6e79685c40000p-6, + 0x1.2dd67591d81dfp-42, + -0x1.6e79685c40000p-6, + 0x1.2dd67591d81dfp-42, + -0x1.6fe50b6f00000p-7, + 0x1.ef5d00e390a00p-44, + -0x1.6fe50b6f00000p-7, + 0x1.ef5d00e390a00p-44, + 0x0.0p+0, + 0x0.0p+0}; +const extern int64_t logD_tbl128_fixedpt[128] = {0x0, + 0x22d443c414148a1, + 0x3a475f892273f13, + 0x51ea81cd5dc13cb, + 0x69be70ddf74c6a8, + 0x81c3f7de5434ed0, + 0x8dd9953002a4e86, + 0xa62b07f3457c407, + 0xbeb024b67dda634, + 
0xd769c8d5b33a728, + 0xe3da945b878e27d, + 0xfce4aee0e88b275, + 0x1162593186da6fc4, + 0x122dadc2ab3496d3, + 0x13c6fb650cde50a1, + 0x1563dc29ffacb20d, + 0x1633a8bf437ce10b, + 0x17d60496cfbb4c67, + 0x18a8980abfbd3266, + 0x1a5094b54d282840, + 0x1b2602497d53458d, + 0x1cd3c712d3110932, + 0x1dac22d3e441d2fe, + 0x1f5fd8a9063e3491, + 0x203b3779f4c3a8bb, + 0x21f509008966a211, + 0x22d380a6c7e2b0e4, + 0x23b30593aa4e106c, + 0x2575418697c3d7e0, + 0x2657fdc6e1dcd0cb, + 0x273bd1c2ab3edefe, + 0x2906cbcd2baf2d54, + 0x29edf7659d8b30f2, + 0x2ad645cd6af1c939, + 0x2bbfb9e3dd5c1c88, + 0x2d961ed0cb91d407, + 0x2e83159d77e31d6d, + 0x2f713e059e555a64, + 0x30609b21823fa654, + 0x315130157f7a64cd, + 0x33360e552d8d64de, + 0x342a5e28530367af, + 0x351ff2e30214bc30, + 0x3616cfe9e8d01fea, + 0x370ef8af6360dfe0, + 0x380870b3c5fb66f7, + 0x39033b85a8bfc871, + 0x39ff5cc235a256c5, + 0x3afcd815786af188, + 0x3bfbb13ab0dc5614, + 0x3cfbebfca715669e, + 0x3dfd8c36023f0ab7, + 0x3f0095d1a19a0332, + 0x40050ccaf800ca8c, + 0x410af52e69f26264, + 0x42125319ae3bbf06, + 0x431b2abc31565be7, + 0x442580577b936763, + 0x4531583f9a2be204, + 0x463eb6db8b4f066d, + 0x474da0a5ad495303, + 0x485e1a2c30df9ea9, + 0x497028118efabeb8, + 0x4a83cf0d01c16e3d, + -0x3466ec14fec0a13b, + -0x335004723c465e69, + -0x323775123e2e1169, + -0x323775123e2e1169, + -0x311d38e5c1644b49, + -0x30014ac62c38a865, + -0x2ee3a574fdf677c9, + -0x2dc4439b3a19bcaf, + -0x2ca31fc8cef74dca, + -0x2ca31fc8cef74dca, + -0x2b803473f7ad0f3f, + -0x2a5b7bf8992d66fc, + -0x2934f0979a3715fd, + -0x280c8c76360892eb, + -0x280c8c76360892eb, + -0x26e2499d499bd9b3, + -0x25b621f89b355ede, + -0x24880f561c0e7305, + -0x24880f561c0e7305, + -0x23580b6523e0e0a5, + -0x22260fb5a616eb96, + -0x20f215b7606012de, + -0x20f215b7606012de, + -0x1fbc16b902680a24, + -0x1e840be74e6a4cc8, + -0x1e840be74e6a4cc8, + -0x1d49ee4c32596fc9, + -0x1c0db6cdd94dee41, + -0x1c0db6cdd94dee41, + -0x1acf5e2db4ec93f0, + -0x198edd077e70df03, + -0x198edd077e70df03, + -0x184c2bd02f03b2fe, + -0x170742d4ef027f2a, + -0x170742d4ef027f2a, + -0x15c01a39fbd687a0, + -0x1476a9f983f74d31, + -0x1476a9f983f74d31, + -0x132ae9e278ae1a1f, + -0x132ae9e278ae1a1f, + -0x11dcd197552b7b5f, + -0x108c588cda79e396, + -0x108c588cda79e396, + -0xf397608bfd2d90e, + -0xf397608bfd2d90e, + -0xde4212056d5dd32, + -0xc8c50b72319ad57, + -0xc8c50b72319ad57, + -0xb31fb7d64898b3e, + -0xb31fb7d64898b3e, + -0x9d517ee93f8e16c, + -0x9d517ee93f8e16c, + -0x8759c4fd14fcd5a, + -0x7137eae42aad7bd, + -0x7137eae42aad7bd, + -0x5aeb4dd63bf61cc, + -0x5aeb4dd63bf61cc, + -0x447347544cd04bb, + -0x447347544cd04bb, + -0x2dcf2d0b85a4531, + -0x2dcf2d0b85a4531, + -0x16fe50b6ef08518, + -0x16fe50b6ef08518, + 0x0}; diff --git a/test/include/test_infra.h b/test/include/test_infra.h index 646e55e..30a4dea 100644 --- a/test/include/test_infra.h +++ b/test/include/test_infra.h @@ -8,47 +8,26 @@ void report_err_fp64(void (*test_func)(size_t, const double *, double *), void report_err_fp64(void (*test_func)(size_t, const double *, double *), long double (*ref_func)(long double), const double *, int); -void report_err_fp64( - void (*test_func)(size_t, const double *, size_t, double *, size_t), - long double (*ref_func)(long double), - double, - double, - int, - int, - int -); - -void report_err2_fp64( - void (*test_func)(size_t, const double *, const double *, double *), - long double (*ref_func)(long double, long double), - double, - double, - int, - double, - double, - int, - bool -); - -void report_err2_fp64( - void (*test_func)(size_t, const double *, size_t, const double *, size_t, double *, size_t), - long 
double (*ref_func)(long double, long double), - double, - double, - int, - int, - double, - double, - int, - int, - int, - bool -); +void report_err_fp64(void (*test_func)(size_t, const double *, size_t, double *, + size_t), + long double (*ref_func)(long double), double, double, int, + int, int); + +void report_err2_fp64(void (*test_func)(size_t, const double *, const double *, + double *), + long double (*ref_func)(long double, long double), double, + double, int, double, double, int, bool); + +void report_err2_fp64(void (*test_func)(size_t, const double *, size_t, + const double *, size_t, double *, + size_t), + long double (*ref_func)(long double, long double), double, + double, int, int, double, double, int, int, int, bool); void report_err_pow_fp64(void (*test_func)(size_t, const double *, - const double *, double *), - long double (*ref_func)(long double, long double), - double, double, double, int); + const double *, double *), + long double (*ref_func)(long double, long double), + double, double, double, int); void report_err_fp80(void (*test_func)(size_t, const double *, const double *, double *, double *), diff --git a/test/src/test_exp2I.cpp b/test/src/test_exp2I.cpp index 226aa23..af28ce2 100644 --- a/test/src/test_exp2I.cpp +++ b/test/src/test_exp2I.cpp @@ -22,7 +22,8 @@ int main() { nb_tests = 30; int stride_x = 21; int stride_y = 39; - report_err_fp64(rvvlm_exp2I, exp2l, x_start, x_end, nb_tests, stride_x, stride_y); + report_err_fp64(rvvlm_exp2I, exp2l, x_start, x_end, nb_tests, stride_x, + stride_y); return 0; } diff --git a/test/src/test_expI.cpp b/test/src/test_expI.cpp index 07c1bf7..76e4e9c 100644 --- a/test/src/test_expI.cpp +++ b/test/src/test_expI.cpp @@ -22,7 +22,8 @@ int main() { nb_tests = 30; int stride_x = 21; int stride_y = 39; - report_err_fp64(rvvlm_expI, expl, x_start, x_end, nb_tests, stride_x, stride_y); + report_err_fp64(rvvlm_expI, expl, x_start, x_end, nb_tests, stride_x, + stride_y); return 0; } diff --git a/test/src/test_expm1.cpp b/test/src/test_expm1.cpp index ef91479..315af4b 100644 --- a/test/src/test_expm1.cpp +++ b/test/src/test_expm1.cpp @@ -19,7 +19,6 @@ int main() { show_special_fp64(rvvlm_expm1, "Special Value handling of this function"); - x_start = -0.01; x_end = 0.01; nb_tests = 300000; diff --git a/test/src/test_expm1I.cpp b/test/src/test_expm1I.cpp index 29858e1..9e9d4ea 100644 --- a/test/src/test_expm1I.cpp +++ b/test/src/test_expm1I.cpp @@ -22,7 +22,8 @@ int main() { nb_tests = 300; int stride_x = 21; int stride_y = 39; - report_err_fp64(rvvlm_expm1I, expm1l, x_start, x_end, nb_tests, stride_x, stride_y); + report_err_fp64(rvvlm_expm1I, expm1l, x_start, x_end, nb_tests, stride_x, + stride_y); return 0; } diff --git a/test/src/test_infra.cpp b/test/src/test_infra.cpp index f43841f..987857a 100644 --- a/test/src/test_infra.cpp +++ b/test/src/test_infra.cpp @@ -209,29 +209,23 @@ void report_err_fp64(void (*test_func)(size_t, const double *, double *), return; } -void report_err_fp64( - void (*test_func)(size_t, const double *, size_t, double *, size_t), - long double (*ref_func)(long double), - double start, - double end, - int nb_pts, - int stride_in, - int stride_out -) { +void report_err_fp64(void (*test_func)(size_t, const double *, size_t, double *, + size_t), + long double (*ref_func)(long double), double start, + double end, int nb_pts, int stride_in, int stride_out) { long double y_ref; double *x, *y, delta; - double abs_err, rel_err, ulp_err; - double max_abs_err, max_rel_err, max_ulp_err; - - x = (double *)malloc(nb_pts * 
sizeof(double) * stride_in ); - y = (double *)malloc(nb_pts * sizeof(double) * stride_out ); - - if (nb_pts <= 1){ + double abs_err, rel_err, ulp_err; + double max_abs_err, max_rel_err, max_ulp_err; + + x = (double *)malloc(nb_pts * sizeof(double) * stride_in); + y = (double *)malloc(nb_pts * sizeof(double) * stride_out); + + if (nb_pts <= 1) { delta = 0.0; nb_pts = 1; - } - else { + } else { delta = (end - start) / (double)(nb_pts - 1); } @@ -240,17 +234,17 @@ void report_err_fp64( max_ulp_err = 0.0; // fill up the set of test points - for (int i=0; i 0.0) { rel_err = abs_err / ABS((double)y_ref); @@ -267,35 +261,35 @@ void report_err_fp64( union sui64_fp64 xxx, yyy; xxx.f = x[j * stride_in]; yyy.f = y[j * stride_out]; - printf("--input %24.17le, 0x%016lx, output %24.17le, 0x%016lx \n", - xxx.f, xxx.ui, yyy.f, yyy.ui); + printf("--input %24.17le, 0x%016lx, output %24.17le, 0x%016lx \n", xxx.f, + xxx.ui, yyy.f, yyy.ui); printf(" reference %24.17le\n\n", (double)y_ref); } } printf("----------------------------\n"); - if ((ABS(start) > 100.) || (ABS(end)<1.e-2)){ + if ((ABS(start) > 100.) || (ABS(end) < 1.e-2)) { printf("Tested %d points in [%8.3le, %8.3le]\n", nb_pts, start, end); - } - else { + } else { printf("Tested %d points in [%3.3lf, %3.3lf]\n", nb_pts, start, end); } printf("Maximum observed absolute error is %8.3le\n", max_abs_err); printf("Maximum observed relative error is %8.3le\n", max_rel_err); - if (max_rel_err > 0.0){ - printf(" which is 2^(%3.3lf)\n", log2(max_rel_err)); + if (max_rel_err > 0.0) { + printf(" which is 2^(%3.3lf)\n", + log2(max_rel_err)); } printf("Maximum observed ULP error is %3.3lf\n", max_ulp_err); free(x); free(y); - + return; } void report_err_pow_fp64(void (*test_func)(size_t, const double *, const double *, double *), - long double (*ref_func)(long double, long double), - double target, double start, double end, int nb_pts) { + long double (*ref_func)(long double, long double), + double target, double start, double end, int nb_pts) { long double z_ref; double *x, *y, *z, delta; @@ -492,43 +486,36 @@ void report_err_fp64(double (*test_func)(double), double (*ref_func)(double), return; } -void report_err2_fp64( - void (*test_func)(size_t, const double *, const double *, double *), - long double (*ref_func)(long double, long double), - double start_x, - double end_x, - int nb_pts_x, - double start_y, - double end_y, - int nb_pts_y, - bool swap_xy -) { +void report_err2_fp64(void (*test_func)(size_t, const double *, const double *, + double *), + long double (*ref_func)(long double, long double), + double start_x, double end_x, int nb_pts_x, + double start_y, double end_y, int nb_pts_y, + bool swap_xy) { long double z_ref; double *x, *y, *z, delta_x, delta_y; - long double abs_err, rel_err, ulp_err; - long double max_abs_err, max_rel_err, max_ulp_err; + long double abs_err, rel_err, ulp_err; + long double max_abs_err, max_rel_err, max_ulp_err; int nb_pts; nb_pts = nb_pts_x * nb_pts_y; - + x = (double *)malloc(nb_pts * sizeof(double)); y = (double *)malloc(nb_pts * sizeof(double)); z = (double *)malloc(nb_pts * sizeof(double)); - if (nb_pts_x <= 1){ + if (nb_pts_x <= 1) { delta_x = 0.0; nb_pts_x = 1; - } - else { + } else { delta_x = (end_x - start_x) / (double)(nb_pts_x - 1); } - if (nb_pts_y <= 1){ + if (nb_pts_y <= 1) { delta_y = 0.0; nb_pts_y = 1; - } - else { + } else { delta_y = (end_y - start_y) / (double)(nb_pts_y - 1); } @@ -540,24 +527,24 @@ void report_err2_fp64( double x_val, y_val; int cnt; cnt = 0; - //printf("\n start_x %le, end_x %le, 
nb_pts_x %d\n", start_x, end_x, nb_pts_x); - //rintf("\n start_y %le, end_y %le, nb_pts_y %d\n", start_y, end_y, nb_pts_y); - - for (int i=0; i 0.0) { @@ -588,76 +573,71 @@ void report_err2_fp64( xxx.f = x[j]; yyy.f = y[j]; zzz.f = z[j]; - printf("--input X %24.17le, 0x%016lx, Y %24.17le, 0x%016lx, \n output %24.17le, 0x%016lx \n", + printf("--input X %24.17le, 0x%016lx, Y %24.17le, 0x%016lx, \n output " + "%24.17le, 0x%016lx \n", xxx.f, xxx.ui, yyy.f, yyy.ui, zzz.f, zzz.ui); printf(" reference %24.17le\n\n", (double)z_ref); } } printf("----------------------------\n"); - if ((ABS(start_x) > 100.) || (ABS(end_x)<1.e-2)){ - printf("Tested %d X-points in [%8.3le, %8.3le]\n", nb_pts_x, start_x, end_x); - printf("Tested %d Y-points in [%8.3le, %8.3le]\n", nb_pts_y, start_y, end_y); - - } - else { - printf("Tested %d X-points in [%3.3lf, %3.3lf]\n", nb_pts_x, start_x, end_x); - printf("Tested %d Y-points in [%3.3lf, %3.3lf]\n", nb_pts_y, start_y, end_y); + if ((ABS(start_x) > 100.) || (ABS(end_x) < 1.e-2)) { + printf("Tested %d X-points in [%8.3le, %8.3le]\n", nb_pts_x, start_x, + end_x); + printf("Tested %d Y-points in [%8.3le, %8.3le]\n", nb_pts_y, start_y, + end_y); + } else { + printf("Tested %d X-points in [%3.3lf, %3.3lf]\n", nb_pts_x, start_x, + end_x); + printf("Tested %d Y-points in [%3.3lf, %3.3lf]\n", nb_pts_y, start_y, + end_y); } printf("Maximum observed absolute error is %8.3Le\n", max_abs_err); printf("Maximum observed relative error is %8.3Le\n", max_rel_err); - if (max_rel_err > 0.0){ - printf(" which is 2^(%3.3Lf)\n", log2l(max_rel_err)); + if (max_rel_err > 0.0) { + printf(" which is 2^(%3.3Lf)\n", + log2l(max_rel_err)); } printf("Maximum observed ULP error is %3.3Lf\n", max_ulp_err); free(x); free(y); free(z); - + return; } -void report_err2_fp64( - void (*test_func)(size_t, const double *, size_t, const double *, size_t, double *, size_t), - long double (*ref_func)(long double, long double), - double start_x, - double end_x, - int nb_pts_x, - int stride_x, - double start_y, - double end_y, - int nb_pts_y, - int stride_y, - int stride_z, - bool swap_xy -) { +void report_err2_fp64(void (*test_func)(size_t, const double *, size_t, + const double *, size_t, double *, + size_t), + long double (*ref_func)(long double, long double), + double start_x, double end_x, int nb_pts_x, int stride_x, + double start_y, double end_y, int nb_pts_y, int stride_y, + int stride_z, bool swap_xy) { long double z_ref; double *x, *y, *z, delta_x, delta_y; - long double abs_err, rel_err, ulp_err; - long double max_abs_err, max_rel_err, max_ulp_err; + long double abs_err, rel_err, ulp_err; + long double max_abs_err, max_rel_err, max_ulp_err; int nb_pts; nb_pts = nb_pts_x * nb_pts_y; - + x = (double *)malloc(nb_pts * stride_x * sizeof(double)); y = (double *)malloc(nb_pts * stride_y * sizeof(double)); z = (double *)malloc(nb_pts * stride_z * sizeof(double)); - if (nb_pts_x <= 1){ + if (nb_pts_x <= 1) { delta_x = 0.0; nb_pts_x = 1; - } - else { + } else { delta_x = (end_x - start_x) / (double)(nb_pts_x - 1); } - if (nb_pts_y <= 1){ + if (nb_pts_y <= 1) { delta_y = 0.0; nb_pts_y = 1; - } - else { + } else { delta_y = (end_y - start_y) / (double)(nb_pts_y - 1); } @@ -669,37 +649,39 @@ void report_err2_fp64( double x_val, y_val; int cnt; cnt = 0; - //printf("\n start_x %le, end_x %le, nb_pts_x %d\n", start_x, end_x, nb_pts_x); - //rintf("\n start_y %le, end_y %le, nb_pts_y %d\n", start_y, end_y, nb_pts_y); - - for (int i=0; i 0.0) { @@ -715,41 +697,43 @@ void report_err2_fp64( if (VERBOSE) { union 
sui64_fp64 xxx, yyy, zzz; - xxx.f = x[j*stride_x]; - yyy.f = y[j*stride_y]; - zzz.f = z[j*stride_z]; - printf("--input X %24.17le, 0x%016lx, Y %24.17le, 0x%016lx, \n output %24.17le, 0x%016lx \n", + xxx.f = x[j * stride_x]; + yyy.f = y[j * stride_y]; + zzz.f = z[j * stride_z]; + printf("--input X %24.17le, 0x%016lx, Y %24.17le, 0x%016lx, \n output " + "%24.17le, 0x%016lx \n", xxx.f, xxx.ui, yyy.f, yyy.ui, zzz.f, zzz.ui); printf(" reference %24.17le\n\n", (double)z_ref); } } printf("----------------------------\n"); - if ((ABS(start_x) > 100.) || (ABS(end_x)<1.e-2)){ - printf("Tested %d X-points in [%8.3le, %8.3le]\n", nb_pts_x, start_x, end_x); - printf("Tested %d Y-points in [%8.3le, %8.3le]\n", nb_pts_y, start_y, end_y); - - } - else { - printf("Tested %d X-points in [%3.3lf, %3.3lf]\n", nb_pts_x, start_x, end_x); - printf("Tested %d Y-points in [%3.3lf, %3.3lf]\n", nb_pts_y, start_y, end_y); + if ((ABS(start_x) > 100.) || (ABS(end_x) < 1.e-2)) { + printf("Tested %d X-points in [%8.3le, %8.3le]\n", nb_pts_x, start_x, + end_x); + printf("Tested %d Y-points in [%8.3le, %8.3le]\n", nb_pts_y, start_y, + end_y); + } else { + printf("Tested %d X-points in [%3.3lf, %3.3lf]\n", nb_pts_x, start_x, + end_x); + printf("Tested %d Y-points in [%3.3lf, %3.3lf]\n", nb_pts_y, start_y, + end_y); } printf("Maximum observed absolute error is %8.3Le\n", max_abs_err); printf("Maximum observed relative error is %8.3Le\n", max_rel_err); - if (max_rel_err > 0.0){ - printf(" which is 2^(%3.3Lf)\n", log2l(max_rel_err)); + if (max_rel_err > 0.0) { + printf(" which is 2^(%3.3Lf)\n", + log2l(max_rel_err)); } printf("Maximum observed ULP error is %3.3Lf\n", max_ulp_err); free(x); free(y); free(z); - + return; } - - void show_special_fp64(void (*test_func)(size_t, const double *, double *), const char *title) { diff --git a/test/src/test_log.cpp b/test/src/test_log.cpp index f543500..40fbb53 100644 --- a/test/src/test_log.cpp +++ b/test/src/test_log.cpp @@ -44,6 +44,5 @@ int main() { nb_tests = 400000; report_err_fp64(rvvlm_log, logl, x_start, x_end, nb_tests); - return 0; } diff --git a/test/src/test_log10.cpp b/test/src/test_log10.cpp index f87a65c..dd672d5 100644 --- a/test/src/test_log10.cpp +++ b/test/src/test_log10.cpp @@ -44,6 +44,5 @@ int main() { nb_tests = 400000; report_err_fp64(rvvlm_log10, log10l, x_start, x_end, nb_tests); - return 0; } diff --git a/test/src/test_log10I.cpp b/test/src/test_log10I.cpp index 5dfb7dc..66addc4 100644 --- a/test/src/test_log10I.cpp +++ b/test/src/test_log10I.cpp @@ -22,7 +22,8 @@ int main() { nb_tests = 300; int stride_x = 21; int stride_y = 39; - report_err_fp64(rvvlm_log10I, log10l, x_start, x_end, nb_tests, stride_x, stride_y); + report_err_fp64(rvvlm_log10I, log10l, x_start, x_end, nb_tests, stride_x, + stride_y); return 0; } diff --git a/test/src/test_log1pI.cpp b/test/src/test_log1pI.cpp index 2a748fc..c5f79b4 100644 --- a/test/src/test_log1pI.cpp +++ b/test/src/test_log1pI.cpp @@ -22,7 +22,8 @@ int main() { nb_tests = 300; int stride_x = 21; int stride_y = 39; - report_err_fp64(rvvlm_log1pI, log1pl, x_start, x_end, nb_tests, stride_x, stride_y); + report_err_fp64(rvvlm_log1pI, log1pl, x_start, x_end, nb_tests, stride_x, + stride_y); return 0; } diff --git a/test/src/test_log2.cpp b/test/src/test_log2.cpp index 93cad12..082f84c 100644 --- a/test/src/test_log2.cpp +++ b/test/src/test_log2.cpp @@ -44,6 +44,5 @@ int main() { nb_tests = 400000; report_err_fp64(rvvlm_log2, log2l, x_start, x_end, nb_tests); - return 0; } diff --git a/test/src/test_log2I.cpp 
b/test/src/test_log2I.cpp index 95c584a..007b180 100644 --- a/test/src/test_log2I.cpp +++ b/test/src/test_log2I.cpp @@ -22,7 +22,8 @@ int main() { nb_tests = 300; int stride_x = 21; int stride_y = 39; - report_err_fp64(rvvlm_log2I, log2l, x_start, x_end, nb_tests, stride_x, stride_y); + report_err_fp64(rvvlm_log2I, log2l, x_start, x_end, nb_tests, stride_x, + stride_y); return 0; } diff --git a/test/src/test_logI.cpp b/test/src/test_logI.cpp index e7428f6..876647b 100644 --- a/test/src/test_logI.cpp +++ b/test/src/test_logI.cpp @@ -22,7 +22,8 @@ int main() { nb_tests = 300; int stride_x = 21; int stride_y = 39; - report_err_fp64(rvvlm_logI, logl, x_start, x_end, nb_tests, stride_x, stride_y); + report_err_fp64(rvvlm_logI, logl, x_start, x_end, nb_tests, stride_x, + stride_y); return 0; } diff --git a/test/src/test_pow.cpp b/test/src/test_pow.cpp index e332b6e..1f3be0d 100644 --- a/test/src/test_pow.cpp +++ b/test/src/test_pow.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include #include +#include #include "rvvlm.h" #include "test_infra.h" @@ -29,18 +29,20 @@ int main() { y_start = 0x1.0p-3; y_end = 0x1.0p4; nb_pts_y = 300; - report_err2_fp64(rvvlm_pow, powl, x_start, x_end, nb_pts_x, y_start, y_end, nb_pts_y, swap_xy); + report_err2_fp64(rvvlm_pow, powl, x_start, x_end, nb_pts_x, y_start, y_end, + nb_pts_y, swap_xy); nb_targets = 10; x_start = 0x1.0p-10; - x_end = 0x1.0p10;; + x_end = 0x1.0p10; + ; nb_tests = 40000; target_start = -1070.0 * log(2.0); target_end = -1020.0 * log(2.0); delta = (target_end - target_start) / (double)(nb_targets - 1); for (int j = 0; j < nb_targets; j++) { - target = target_start + (double)j * delta; + target = target_start + (double)j * delta; report_err_pow_fp64(rvvlm_pow, powl, target, x_start, x_end, nb_tests); } @@ -48,7 +50,7 @@ int main() { target_end = 32.0 * log(2.0); delta = (target_end - target_start) / (double)(nb_targets - 1); for (int j = 0; j < nb_targets; j++) { - target = target_start + (double)j * delta; + target = target_start + (double)j * delta; report_err_pow_fp64(rvvlm_pow, powl, target, x_start, x_end, nb_tests); } @@ -56,7 +58,7 @@ int main() { target_end = 1023.5 * log(2.0); delta = (target_end - target_start) / (double)(nb_targets - 1); for (int j = 0; j < nb_targets; j++) { - target = target_start + (double)j * delta; + target = target_start + (double)j * delta; report_err_pow_fp64(rvvlm_pow, powl, target, x_start, x_end, nb_tests); } @@ -64,7 +66,7 @@ int main() { target_end = -80.0 * log(2.0); delta = (target_end - target_start) / (double)(nb_targets - 1); for (int j = 0; j < nb_targets; j++) { - target = target_start + (double)j * delta; + target = target_start + (double)j * delta; report_err_pow_fp64(rvvlm_pow, powl, target, x_start, x_end, nb_tests); } diff --git a/test/src/test_powI.cpp b/test/src/test_powI.cpp index 7748b6f..b29990b 100644 --- a/test/src/test_powI.cpp +++ b/test/src/test_powI.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include #include +#include #include "rvvlm.h" #include "test_infra.h" @@ -29,9 +29,8 @@ int main() { stride_y = 39; stride_z = 11; swap_xy = 0; - report_err2_fp64(rvvlm_powI, powl, x_start, x_end, nb_pts_x, stride_x, - y_start, y_end, nb_pts_y, stride_y, - stride_z, swap_xy); + report_err2_fp64(rvvlm_powI, powl, x_start, x_end, nb_pts_x, stride_x, + y_start, y_end, nb_pts_y, stride_y, stride_z, swap_xy); return 0; } diff --git a/tools/run-clang-format.py b/tools/run-clang-format.py new file mode 100755 index 0000000..49cb772 --- /dev/null +++ 
b/tools/run-clang-format.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2023 Rivos Inc. +# +# SPDX-License-Identifier: Apache-2.0 + +import click +from pathlib import Path +import subprocess +import sys + +def get_files(git_dir: Path, base_commit: str): + if base_commit is None: + git_command = [ + "git", "-C", str(git_dir.absolute()), "ls-tree", "-r", "--name-only", "HEAD" + ] + else: + git_command = [ + "git", "-C", str(git_dir.absolute()), "diff", "--name-only", "--diff-filter=CAMT", base_commit + ] + git_files = subprocess.run(git_command, check=True, capture_output=True).\ + stdout.decode("utf-8").strip().split("\n") + return git_files + + +def file_filter(fname: str): + extensions = [".c", ".cpp", ".h", ".hpp"] + for extension in extensions: + if fname.endswith(extension): + return True + return False + + +def check_format_changes(git_dir: Path, base_commit: str): + git_files = get_files(git_dir, base_commit) + files_to_update = [] + for fname in filter(file_filter, git_files): + # Do a dry run, and stop on the first error found + clang_format_dry = ["clang-format", "-n", "--ferror-limit=1", str(git_dir.absolute() / fname)] + clang_format_output = ( + subprocess.run(clang_format_dry, check=True, capture_output=True) + .stderr.decode("utf-8").strip() + ) + needs_clang_format = (len(clang_format_output) > 0) + if needs_clang_format: + files_to_update.append(str(git_dir.absolute() / fname)) + if len(files_to_update) > 0: + sep="\n\t" + print(f"Found clang-format changes in files\n{sep.join(files_to_update)}" + "\nPlease address clang-format errors and commit the changes") + return 1 + print("No clang-format changes found") + return 0 + + +def apply_clang_format(git_dir: Path, base_commit: str): + git_files = get_files(git_dir, base_commit) + # Apply clang format to all of the files we are interested in + for fname in filter(file_filter, git_files): + clang_format_apply = ["clang-format", "-i", str(git_dir.absolute() / fname)] + subprocess.run(clang_format_apply, check=True) + return 0 + + +@click.command() +@click.argument( + "run_type", + type=click.Choice(["check", "apply"]), +) +@click.option( + "--base-commit", + help="Base commit to find files with changes in. " + "If not given, runs clang format on the whole repo" +) +def main(run_type, base_commit): + git_dir = Path(__file__).absolute().parent.parent + + clang_format_func = check_format_changes if (run_type == "check") else apply_clang_format + sys.exit(clang_format_func(git_dir, base_commit)) + + +if __name__ == "__main__": + main()