From ab3c498d81295116fc578adc2facb4dd1e932020 Mon Sep 17 00:00:00 2001
From: PingTakPeterTang <ptpt@rivosinc.com>
Date: Wed, 6 Mar 2024 17:55:10 -0800
Subject: [PATCH] fix formatting

---
 include/foo.h                  |  11 --
 include/rvvlm_erfcinvD.inc.h   | 229 ++++++++++++++-------------
 include/rvvlm_erfinvD.inc.h    | 207 ++++++++++++++-----------
 include/rvvlm_inverrorfuncsD.h | 276 ++++++++++++++++-----------------
 test/src/test_erfcinv.cpp      |  46 ++++--
 test/src/test_erfinv.cpp       |  59 ++++---
 6 files changed, 444 insertions(+), 384 deletions(-)
 delete mode 100755 include/foo.h

diff --git a/include/foo.h b/include/foo.h
deleted file mode 100755
index 49c5391..0000000
--- a/include/foo.h
+++ /dev/null
@@ -1,11 +0,0 @@
-
-#define PSTEP_I_SLL(COEFF_j, X, K, POLY, vlen)                                 \
-  __riscv_vsadd(__riscv_vsll(                                                  \
-    __riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)),                       \
-        (COEFF_j), (vlen))
-
-#define PSTEP_I_SRA(COEFF_j, X, K, POLY, vlen)                                 \
-  __riscv_vsadd(__riscv_vsra(                                                  \
-    __riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)),                       \
-        (COEFF_j), (vlen))
-
diff --git a/include/rvvlm_erfcinvD.inc.h b/include/rvvlm_erfcinvD.inc.h
index 15816ee..609f797 100644
--- a/include/rvvlm_erfcinvD.inc.h
+++ b/include/rvvlm_erfcinvD.inc.h
@@ -8,66 +8,65 @@
 #define F_VER1 RVVLM_ERFCINVDI_STD
 #endif
 
-//Erfcinv is defined on (0, 2). Suffices to consider (0, 1]
-//Two regions of approximation: left is [0, 0x1.2p-2) and right is [0x1.2p-1, 1)
-//Both are done with rational functions.
-//For right, t*P(t)/Q(t) t = 1-x; x in [0x1.2p-1, 1)
-//For left, y*P(t)/Q(t), y = sqrt(-log(x)); and t = 1/y
-
-// P_coefficients in asending order, all in Q79. p0_delta is in floating point 
-#define P_right_0  -0x48dbe9f5b3eabaa
-#define P_right_1  -0xb35279f1a626ae5
-#define P_right_2  0x33789911873d184a
-#define P_right_3  -0x1bf9138fc77c0fbf
-#define P_right_4  -0x2d4ec43bc48403d4
-#define P_right_5  0x2d61deb53842cca1
-#define P_right_6  0x23324eca6b3ff02
-#define P_right_7  -0xd4ec1d31542c4fc
-#define P_right_8  0x2ecf3c60308b0f2
-#define P_right_9  0x7c917b3378071e
-#define P_right_10  -0x1e09b597f226ca
+// Erfcinv is defined on (0, 2). Suffices to consider (0, 1]
+// Two regions of approximation: left is [0, 0x1.2p-2) and right is [0x1.2p-1,
+// 1) Both are done with rational functions. For right, t*P(t)/Q(t) t = 1-x; x
+// in [0x1.2p-1, 1) For left, y*P(t)/Q(t), y = sqrt(-log(x)); and t = 1/y
+
+// P_coefficients in asending order, all in Q79. p0_delta is in floating point
+#define P_right_0 -0x48dbe9f5b3eabaa
+#define P_right_1 -0xb35279f1a626ae5
+#define P_right_2 0x33789911873d184a
+#define P_right_3 -0x1bf9138fc77c0fbf
+#define P_right_4 -0x2d4ec43bc48403d4
+#define P_right_5 0x2d61deb53842cca1
+#define P_right_6 0x23324eca6b3ff02
+#define P_right_7 -0xd4ec1d31542c4fc
+#define P_right_8 0x2ecf3c60308b0f2
+#define P_right_9 0x7c917b3378071e
+#define P_right_10 -0x1e09b597f226ca
 #define DELTA_P0_right -0x1.ec7dc41c17860p-2
 
-// Q_coefficients in asending order, all in Q79. q0_delta is in floating point 
-#define Q_right_0  -0x52366e5b14c0970
-#define Q_right_1  -0xca57e95abcc599b
-#define Q_right_2  0x3b6c91ec67f5759c
-#define Q_right_3  -0x1c40d5daa3be22bc
-#define Q_right_4  -0x41f11eb5d837386c
-#define Q_right_5  0x3c6ce478fcd75c9a
-#define Q_right_6  0xbb1cd7270cfba1d
-#define Q_right_7  -0x1988a4116498f1af
-#define Q_right_8  0x44dc3042f103d20
-#define Q_right_9  0x2390e683d02edf3
-#define Q_right_10  -0x8ec66f2a7e410c
+// Q_coefficients in asending order, all in Q79. q0_delta is in floating point
+#define Q_right_0 -0x52366e5b14c0970
+#define Q_right_1 -0xca57e95abcc599b
+#define Q_right_2 0x3b6c91ec67f5759c
+#define Q_right_3 -0x1c40d5daa3be22bc
+#define Q_right_4 -0x41f11eb5d837386c
+#define Q_right_5 0x3c6ce478fcd75c9a
+#define Q_right_6 0xbb1cd7270cfba1d
+#define Q_right_7 -0x1988a4116498f1af
+#define Q_right_8 0x44dc3042f103d20
+#define Q_right_9 0x2390e683d02edf3
+#define Q_right_10 -0x8ec66f2a7e410c
 #define DELTA_Q0_right -0x1.29a0161e99446p-3
 
-// P_coefficients in asending order, all in Q67. p0_delta is in floating point 
-#define P_left_0  0x17a0bb69321df
-#define P_left_1  0x402eb416ae6015
-#define P_left_2  0x2973eb18028ce34
-#define P_left_3  0x8034a7ece1d5370
-#define P_left_4  0xa76c08a74dae273
-#define P_left_5  0x11dd3876b83dd078
-#define P_left_6  0xfdd7693c3b77653
-#define P_left_7  0xb33d66152b3c223
-#define P_left_8  0x5a564c28c6a41a9
-#define P_left_9  0x1190449fe630213
-#define P_left_10  -0x659c784274e1
+// P_coefficients in asending order, all in Q67. p0_delta is in floating point
+#define P_left_0 0x17a0bb69321df
+#define P_left_1 0x402eb416ae6015
+#define P_left_2 0x2973eb18028ce34
+#define P_left_3 0x8034a7ece1d5370
+#define P_left_4 0xa76c08a74dae273
+#define P_left_5 0x11dd3876b83dd078
+#define P_left_6 0xfdd7693c3b77653
+#define P_left_7 0xb33d66152b3c223
+#define P_left_8 0x5a564c28c6a41a9
+#define P_left_9 0x1190449fe630213
+#define P_left_10 -0x659c784274e1
 #define DELTA_P0_left -0x1.d622f4cbe0eeep-2
 
-// Q_coefficients in asending order, all in Q67. q0_delta is in floating point 
-#define Q_left_0  0x17a09aabf9cee
-#define Q_left_1  0x4030b9059ffcad
-#define Q_left_2  0x29b26b0d87f7855
-#define Q_left_3  0x87572a13d3fa2dd
-#define Q_left_4  0xd7a728b5620ac3c
-#define Q_left_5  0x1754392b473fd439
-#define Q_left_6  0x1791b9a091a816c2
-#define Q_left_7  0x167f71db9e13b075
-#define Q_left_8  0xcb9f5f3e5e618a4
-#define Q_left_9  0x68271fae767c68e
-#define Q_left_10  0x13745c4fa224b25
+// Q_coefficients in asending order, all in Q67. q0_delta is in floating point
+#define Q_left_0 0x17a09aabf9cee
+#define Q_left_1 0x4030b9059ffcad
+#define Q_left_2 0x29b26b0d87f7855
+#define Q_left_3 0x87572a13d3fa2dd
+#define Q_left_4 0xd7a728b5620ac3c
+#define Q_left_5 0x1754392b473fd439
+#define Q_left_6 0x1791b9a091a816c2
+#define Q_left_7 0x167f71db9e13b075
+#define Q_left_8 0xcb9f5f3e5e618a4
+#define Q_left_9 0x68271fae767c68e
+#define Q_left_10 0x13745c4fa224b25
 #define DELTA_Q0_left 0x1.f7e7557a34ae6p-2
 
 void F_VER1(API) {
@@ -75,7 +74,7 @@ void F_VER1(API) {
   VFLOAT vx, vx_sign, vy, vy_special;
   VBOOL special_args;
 
-  VFLOAT dummy; 
+  VFLOAT dummy;
 
   SET_ROUNDTONEAREST;
   // stripmining over input arguments
@@ -92,7 +91,7 @@ void F_VER1(API) {
     VFLOAT two_minus_x = __riscv_vfadd(vx_sign, fp_posOne, vlen);
     VBOOL x_gt_1 = __riscv_vmflt(vx_sign, fp_posZero, vlen);
     vx = __riscv_vmerge(vx, two_minus_x, x_gt_1, vlen);
-    // vx is now in (0, 1] 
+    // vx is now in (0, 1]
     VBOOL x_in_left = __riscv_vmfle(vx, 0x1.2p-2, vlen);
 
     VFLOAT w_hi, w_lo, w_hi_left, w_lo_left, y_hi, y_lo;
@@ -100,22 +99,22 @@ void F_VER1(API) {
     VBOOL x_is_tiny;
 
     if (__riscv_vcpop(x_in_left, vlen) > 0) {
-        VFLOAT x_left = VFMV_VF(0x1.0p-3, vlen);
-        x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen);
-        x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-52, vlen);
-        INT n_adjust = 60;
-        x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen);
-        NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen);
-    
-        SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left, 
-                          0x1.0p63, 0x1.0p-63, vlen);
-        if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
-          VFLOAT w_hi_dummy, w_lo_dummy;
-          SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny,
-                            0x1.0p65, 0x1.0p-65, vlen);
-          dummy = __riscv_vfmul(__riscv_vfcvt_f(T_tiny, vlen), 0x1.0p-2, vlen);
-          dummy = __riscv_vfsub(dummy, __riscv_vfcvt_f(T_left, vlen), vlen);
-        }
+      VFLOAT x_left = VFMV_VF(0x1.0p-3, vlen);
+      x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen);
+      x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-52, vlen);
+      INT n_adjust = 60;
+      x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen);
+      NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen);
+
+      SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left, 0x1.0p63,
+                        0x1.0p-63, vlen);
+      if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
+        VFLOAT w_hi_dummy, w_lo_dummy;
+        SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p65,
+                          0x1.0p-65, vlen);
+        dummy = __riscv_vfmul(__riscv_vfcvt_f(T_tiny, vlen), 0x1.0p-2, vlen);
+        dummy = __riscv_vfsub(dummy, __riscv_vfcvt_f(T_left, vlen), vlen);
+      }
     }
     w_hi = VFMV_VF(fp_posOne, vlen);
     w_hi = __riscv_vfsub(w_hi, vx, vlen);
@@ -125,54 +124,76 @@ void F_VER1(API) {
     VFLOAT delta_t = __riscv_vfmul(w_lo, 0x1.0p63, vlen);
     T = __riscv_vadd(T, __riscv_vfcvt_x(delta_t, vlen), vlen);
     T = __riscv_vmerge(T, T_left, x_in_left, vlen);
-    
+
     w_hi = __riscv_vmerge(w_hi, w_hi_left, x_in_left, vlen);
     w_lo = __riscv_vmerge(w_lo, w_lo_left, x_in_left, vlen);
 
-    //For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
+    // For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
     VINT P, Q;
 
     P = __riscv_vmerge(VMVI_VX(P_right_10, vlen), P_left_10, x_in_left, vlen);
-    P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T, 
-        PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T, 
-        PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T, 
-        PSTEP_I_ab(x_in_left, P_left_9, P_right_9, T, P,
-        vlen), vlen), vlen), vlen); 
+    P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T,
+                   PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T,
+                              PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T,
+                                         PSTEP_I_ab(x_in_left, P_left_9,
+                                                    P_right_9, T, P, vlen),
+                                         vlen),
+                              vlen),
+                   vlen);
 
     Q = __riscv_vmerge(VMVI_VX(Q_right_10, vlen), Q_left_10, x_in_left, vlen);
-    Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T, 
-        PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T, 
-        PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T, 
-        PSTEP_I_ab(x_in_left, Q_left_9, Q_right_9, T, Q,
-        vlen), vlen), vlen), vlen); 
-        
-    P = PSTEP_I_ab(x_in_left, P_left_0, P_right_0, T, 
-        PSTEP_I_ab(x_in_left, P_left_1, P_right_1, T, 
-        PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T, 
-        PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T, 
-        PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T, 
-        PSTEP_I_ab(x_in_left, P_left_5, P_right_5, T, P,
-        vlen), vlen), vlen), vlen), vlen), vlen);
-
-    Q = PSTEP_I_ab(x_in_left, Q_left_0, Q_right_0, T, 
-        PSTEP_I_ab(x_in_left, Q_left_1, Q_right_1, T, 
-        PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T, 
-        PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T, 
-        PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T, 
-        PSTEP_I_ab(x_in_left, Q_left_5, Q_right_5, T, Q,
-        vlen), vlen), vlen), vlen), vlen), vlen);
+    Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T,
+                   PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T,
+                              PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T,
+                                         PSTEP_I_ab(x_in_left, Q_left_9,
+                                                    Q_right_9, T, Q, vlen),
+                                         vlen),
+                              vlen),
+                   vlen);
+
+    P = PSTEP_I_ab(
+        x_in_left, P_left_0, P_right_0, T,
+        PSTEP_I_ab(
+            x_in_left, P_left_1, P_right_1, T,
+            PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T,
+                       PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T,
+                                  PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T,
+                                             PSTEP_I_ab(x_in_left, P_left_5,
+                                                        P_right_5, T, P, vlen),
+                                             vlen),
+                                  vlen),
+                       vlen),
+            vlen),
+        vlen);
+
+    Q = PSTEP_I_ab(
+        x_in_left, Q_left_0, Q_right_0, T,
+        PSTEP_I_ab(
+            x_in_left, Q_left_1, Q_right_1, T,
+            PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T,
+                       PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T,
+                                  PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T,
+                                             PSTEP_I_ab(x_in_left, Q_left_5,
+                                                        Q_right_5, T, Q, vlen),
+                                             vlen),
+                                  vlen),
+                       vlen),
+            vlen),
+        vlen);
 
     VFLOAT p_hi, p_lo;
     p_hi = __riscv_vfcvt_f(P, vlen);
 
-    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen);
+    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen),
+                           vlen);
     VFLOAT delta_p0 = VFMV_VF(DELTA_P0_right, vlen);
     delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_left, x_in_left, vlen);
     p_lo = __riscv_vfadd(p_lo, delta_p0, vlen);
 
     VFLOAT q_hi, q_lo;
     q_hi = __riscv_vfcvt_f(Q, vlen);
-    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen);
+    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen),
+                           vlen);
     VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_right, vlen);
     delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_left, x_in_left, vlen);
     q_lo = __riscv_vfadd(q_lo, delta_q0, vlen);
@@ -193,8 +214,8 @@ void F_VER1(API) {
     y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen);
 
     DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen);
-    
-    vy = w_hi; 
+
+    vy = w_hi;
 
     vy = __riscv_vfsgnj(vy, vx_sign, vlen);
     vy = __riscv_vmerge(vy, vy_special, special_args, vlen);
diff --git a/include/rvvlm_erfinvD.inc.h b/include/rvvlm_erfinvD.inc.h
index f7d78b4..b1f346d 100644
--- a/include/rvvlm_erfinvD.inc.h
+++ b/include/rvvlm_erfinvD.inc.h
@@ -8,65 +8,64 @@
 #define F_VER1 RVVLM_ERFINVDI_STD
 #endif
 
-//Two regions of approximation: left is [0, 0x1.7p-1) and right is [0x1.7p-1, 1)
-//Both are done with rational functions.
-//For left, x*P(x)/Q(x) x in [0, 0x1.7p-1)
-//For right, y*P(t)/Q(t), y = sqrt(-log(1-x)); and t = 1/y
-
-// P_coefficients in asending order, all in Q79. p0_delta is in floating point 
-#define P_left_0  -0x48dbe9f5b3eabaa
-#define P_left_1  -0xb35279f1a626ae5
-#define P_left_2  0x33789911873d184a
-#define P_left_3  -0x1bf9138fc77c0fbf
-#define P_left_4  -0x2d4ec43bc48403d4
-#define P_left_5  0x2d61deb53842cca1
-#define P_left_6  0x23324eca6b3ff02
-#define P_left_7  -0xd4ec1d31542c4fc
-#define P_left_8  0x2ecf3c60308b0f2
-#define P_left_9  0x7c917b3378071e
-#define P_left_10  -0x1e09b597f226ca
+// Two regions of approximation: left is [0, 0x1.7p-1) and right is [0x1.7p-1,
+// 1) Both are done with rational functions. For left, x*P(x)/Q(x) x in [0,
+// 0x1.7p-1) For right, y*P(t)/Q(t), y = sqrt(-log(1-x)); and t = 1/y
+
+// P_coefficients in asending order, all in Q79. p0_delta is in floating point
+#define P_left_0 -0x48dbe9f5b3eabaa
+#define P_left_1 -0xb35279f1a626ae5
+#define P_left_2 0x33789911873d184a
+#define P_left_3 -0x1bf9138fc77c0fbf
+#define P_left_4 -0x2d4ec43bc48403d4
+#define P_left_5 0x2d61deb53842cca1
+#define P_left_6 0x23324eca6b3ff02
+#define P_left_7 -0xd4ec1d31542c4fc
+#define P_left_8 0x2ecf3c60308b0f2
+#define P_left_9 0x7c917b3378071e
+#define P_left_10 -0x1e09b597f226ca
 #define DELTA_P0_left -0x1.ec7dc41c17860p-2
 
-// Q_coefficients in asending order, all in Q79. q0_delta is in floating point 
-#define Q_left_0  -0x52366e5b14c0970
-#define Q_left_1  -0xca57e95abcc599b
-#define Q_left_2  0x3b6c91ec67f5759c
-#define Q_left_3  -0x1c40d5daa3be22bc
-#define Q_left_4  -0x41f11eb5d837386c
-#define Q_left_5  0x3c6ce478fcd75c9a
-#define Q_left_6  0xbb1cd7270cfba1d
-#define Q_left_7  -0x1988a4116498f1af
-#define Q_left_8  0x44dc3042f103d20
-#define Q_left_9  0x2390e683d02edf3
-#define Q_left_10  -0x8ec66f2a7e410c
+// Q_coefficients in asending order, all in Q79. q0_delta is in floating point
+#define Q_left_0 -0x52366e5b14c0970
+#define Q_left_1 -0xca57e95abcc599b
+#define Q_left_2 0x3b6c91ec67f5759c
+#define Q_left_3 -0x1c40d5daa3be22bc
+#define Q_left_4 -0x41f11eb5d837386c
+#define Q_left_5 0x3c6ce478fcd75c9a
+#define Q_left_6 0xbb1cd7270cfba1d
+#define Q_left_7 -0x1988a4116498f1af
+#define Q_left_8 0x44dc3042f103d20
+#define Q_left_9 0x2390e683d02edf3
+#define Q_left_10 -0x8ec66f2a7e410c
 #define DELTA_Q0_left -0x1.29a0161e99446p-3
 
-// P_coefficients in asending order, all in Q67. p0_delta is in floating point 
-#define P_right_0  0x17a0bb69321df
-#define P_right_1  0x402eb416ae6015
-#define P_right_2  0x2973eb18028ce34
-#define P_right_3  0x8034a7ece1d5370
-#define P_right_4  0xa76c08a74dae273
-#define P_right_5  0x11dd3876b83dd078
-#define P_right_6  0xfdd7693c3b77653
-#define P_right_7  0xb33d66152b3c223
-#define P_right_8  0x5a564c28c6a41a9
-#define P_right_9  0x1190449fe630213
-#define P_right_10  -0x659c784274e1
+// P_coefficients in asending order, all in Q67. p0_delta is in floating point
+#define P_right_0 0x17a0bb69321df
+#define P_right_1 0x402eb416ae6015
+#define P_right_2 0x2973eb18028ce34
+#define P_right_3 0x8034a7ece1d5370
+#define P_right_4 0xa76c08a74dae273
+#define P_right_5 0x11dd3876b83dd078
+#define P_right_6 0xfdd7693c3b77653
+#define P_right_7 0xb33d66152b3c223
+#define P_right_8 0x5a564c28c6a41a9
+#define P_right_9 0x1190449fe630213
+#define P_right_10 -0x659c784274e1
 #define DELTA_P0_right -0x1.d622f4cbe0eeep-2
 
-// Q_coefficients in asending order, all in Q67. q0_delta is in floating point 
-#define Q_right_0  0x17a09aabf9cee
-#define Q_right_1  0x4030b9059ffcad
-#define Q_right_2  0x29b26b0d87f7855
-#define Q_right_3  0x87572a13d3fa2dd
-#define Q_right_4  0xd7a728b5620ac3c
-#define Q_right_5  0x1754392b473fd439
-#define Q_right_6  0x1791b9a091a816c2
-#define Q_right_7  0x167f71db9e13b075
-#define Q_right_8  0xcb9f5f3e5e618a4
-#define Q_right_9  0x68271fae767c68e
-#define Q_right_10  0x13745c4fa224b25
+// Q_coefficients in asending order, all in Q67. q0_delta is in floating point
+#define Q_right_0 0x17a09aabf9cee
+#define Q_right_1 0x4030b9059ffcad
+#define Q_right_2 0x29b26b0d87f7855
+#define Q_right_3 0x87572a13d3fa2dd
+#define Q_right_4 0xd7a728b5620ac3c
+#define Q_right_5 0x1754392b473fd439
+#define Q_right_6 0x1791b9a091a816c2
+#define Q_right_7 0x167f71db9e13b075
+#define Q_right_8 0xcb9f5f3e5e618a4
+#define Q_right_9 0x68271fae767c68e
+#define Q_right_10 0x13745c4fa224b25
 #define DELTA_Q0_right 0x1.f7e7557a34ae6p-2
 
 void F_VER1(API) {
@@ -91,69 +90,91 @@ void F_VER1(API) {
     VINT T, T_right;
 
     if (__riscv_vcpop(x_in_right, vlen) > 0) {
-        VFLOAT one_minus_x;
-        one_minus_x = __riscv_vfrsub(vx, fp_posOne, vlen);
+      VFLOAT one_minus_x;
+      one_minus_x = __riscv_vfrsub(vx, fp_posOne, vlen);
 
-        VINT n_adjust;
-        n_adjust = __riscv_vxor(n_adjust, n_adjust, vlen);
+      VINT n_adjust;
+      n_adjust = __riscv_vxor(n_adjust, n_adjust, vlen);
 
-        NEG_LOGX_4_TRANSFORM(one_minus_x, n_adjust, y_hi, y_lo, vlen);
+      NEG_LOGX_4_TRANSFORM(one_minus_x, n_adjust, y_hi, y_lo, vlen);
 
-        SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_right, w_lo_right, T_right, 
-                          0x1.0p63, 0x1.0p-63, vlen);
+      SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_right, w_lo_right, T_right, 0x1.0p63,
+                        0x1.0p-63, vlen);
     }
     T = __riscv_vfcvt_x(__riscv_vfmul(vx, 0x1.0p63, vlen), vlen);
     T = __riscv_vmerge(T, T_right, x_in_right, vlen);
-    
+
     w_hi = vx;
     w_lo = I_AS_F(__riscv_vxor(F_AS_I(w_lo), F_AS_I(w_lo), vlen));
     w_hi = __riscv_vmerge(w_hi, w_hi_right, x_in_right, vlen);
     w_lo = __riscv_vmerge(w_lo, w_lo_right, x_in_right, vlen);
 
-    //For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
+    // For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
     VINT P, Q;
 
     P = __riscv_vmerge(VMVI_VX(P_left_10, vlen), P_right_10, x_in_right, vlen);
-    P = PSTEP_I_ab(x_in_right, P_right_6, P_left_6, T, 
-        PSTEP_I_ab(x_in_right, P_right_7, P_left_7, T, 
-        PSTEP_I_ab(x_in_right, P_right_8, P_left_8, T, 
-        PSTEP_I_ab(x_in_right, P_right_9, P_left_9, T, P,
-        vlen), vlen), vlen), vlen); 
+    P = PSTEP_I_ab(x_in_right, P_right_6, P_left_6, T,
+                   PSTEP_I_ab(x_in_right, P_right_7, P_left_7, T,
+                              PSTEP_I_ab(x_in_right, P_right_8, P_left_8, T,
+                                         PSTEP_I_ab(x_in_right, P_right_9,
+                                                    P_left_9, T, P, vlen),
+                                         vlen),
+                              vlen),
+                   vlen);
 
     Q = __riscv_vmerge(VMVI_VX(Q_left_10, vlen), Q_right_10, x_in_right, vlen);
-    Q = PSTEP_I_ab(x_in_right, Q_right_6, Q_left_6, T, 
-        PSTEP_I_ab(x_in_right, Q_right_7, Q_left_7, T, 
-        PSTEP_I_ab(x_in_right, Q_right_8, Q_left_8, T, 
-        PSTEP_I_ab(x_in_right, Q_right_9, Q_left_9, T, Q,
-        vlen), vlen), vlen), vlen); 
-        
-    P = PSTEP_I_ab(x_in_right, P_right_0, P_left_0, T, 
-        PSTEP_I_ab(x_in_right, P_right_1, P_left_1, T, 
-        PSTEP_I_ab(x_in_right, P_right_2, P_left_2, T, 
-        PSTEP_I_ab(x_in_right, P_right_3, P_left_3, T, 
-        PSTEP_I_ab(x_in_right, P_right_4, P_left_4, T, 
-        PSTEP_I_ab(x_in_right, P_right_5, P_left_5, T, P,
-        vlen), vlen), vlen), vlen), vlen), vlen);
-
-    Q = PSTEP_I_ab(x_in_right, Q_right_0, Q_left_0, T, 
-        PSTEP_I_ab(x_in_right, Q_right_1, Q_left_1, T, 
-        PSTEP_I_ab(x_in_right, Q_right_2, Q_left_2, T, 
-        PSTEP_I_ab(x_in_right, Q_right_3, Q_left_3, T, 
-        PSTEP_I_ab(x_in_right, Q_right_4, Q_left_4, T, 
-        PSTEP_I_ab(x_in_right, Q_right_5, Q_left_5, T, Q,
-        vlen), vlen), vlen), vlen), vlen), vlen);
-        
+    Q = PSTEP_I_ab(x_in_right, Q_right_6, Q_left_6, T,
+                   PSTEP_I_ab(x_in_right, Q_right_7, Q_left_7, T,
+                              PSTEP_I_ab(x_in_right, Q_right_8, Q_left_8, T,
+                                         PSTEP_I_ab(x_in_right, Q_right_9,
+                                                    Q_left_9, T, Q, vlen),
+                                         vlen),
+                              vlen),
+                   vlen);
+
+    P = PSTEP_I_ab(
+        x_in_right, P_right_0, P_left_0, T,
+        PSTEP_I_ab(
+            x_in_right, P_right_1, P_left_1, T,
+            PSTEP_I_ab(x_in_right, P_right_2, P_left_2, T,
+                       PSTEP_I_ab(x_in_right, P_right_3, P_left_3, T,
+                                  PSTEP_I_ab(x_in_right, P_right_4, P_left_4, T,
+                                             PSTEP_I_ab(x_in_right, P_right_5,
+                                                        P_left_5, T, P, vlen),
+                                             vlen),
+                                  vlen),
+                       vlen),
+            vlen),
+        vlen);
+
+    Q = PSTEP_I_ab(
+        x_in_right, Q_right_0, Q_left_0, T,
+        PSTEP_I_ab(
+            x_in_right, Q_right_1, Q_left_1, T,
+            PSTEP_I_ab(x_in_right, Q_right_2, Q_left_2, T,
+                       PSTEP_I_ab(x_in_right, Q_right_3, Q_left_3, T,
+                                  PSTEP_I_ab(x_in_right, Q_right_4, Q_left_4, T,
+                                             PSTEP_I_ab(x_in_right, Q_right_5,
+                                                        Q_left_5, T, Q, vlen),
+                                             vlen),
+                                  vlen),
+                       vlen),
+            vlen),
+        vlen);
+
     VFLOAT p_hi, p_lo;
     p_hi = __riscv_vfcvt_f(P, vlen);
 
-    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen);
+    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen),
+                           vlen);
     VFLOAT delta_p0 = VFMV_VF(DELTA_P0_left, vlen);
     delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_right, x_in_right, vlen);
     p_lo = __riscv_vfadd(p_lo, delta_p0, vlen);
 
     VFLOAT q_hi, q_lo;
     q_hi = __riscv_vfcvt_f(Q, vlen);
-    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen);
+    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen),
+                           vlen);
     VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_left, vlen);
     delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_right, x_in_right, vlen);
     q_lo = __riscv_vfadd(q_lo, delta_q0, vlen);
@@ -165,8 +186,8 @@ void F_VER1(API) {
     y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen);
 
     DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen);
-    
-    vy = w_hi; 
+
+    vy = w_hi;
 
     vy = __riscv_vfsgnj(vy, vx_orig, vlen);
     vy = __riscv_vmerge(vy, vy_special, special_args, vlen);
diff --git a/include/rvvlm_inverrorfuncsD.h b/include/rvvlm_inverrorfuncsD.h
index 1b87630..8e1b619 100644
--- a/include/rvvlm_inverrorfuncsD.h
+++ b/include/rvvlm_inverrorfuncsD.h
@@ -7,30 +7,32 @@
 #define NEG_LOG2_HI -0x1.62e42fefa4000p-1
 #define NEG_LOG2_LO 0x1.8432a1b0e2634p-43
 
-// P_coefficients in asending order, in varying scales. p0_delta is in floating point 
-#define P_tiny_0  -0x8593442e139d      // scale 66
-#define P_tiny_1  -0x1fcf7055ac5f03    // scale 64
-#define P_tiny_2  -0x106dde33d8dc179   // scale 61
-#define P_tiny_3  -0xc31d6e09935118a   // scale 60
-#define P_tiny_4  -0x3560de73cb5bbcc0  // scale 59
-#define P_tiny_5  -0x1eb4c7e14b254de8  // scale 57
-#define P_tiny_6  0x1fdf5a9d23430bd7   // scale 56
-#define P_tiny_7  0x62c4020631de121b   // scale 56
-#define P_tiny_8  0x4be1ed5d031773f1   // scale 58
-#define P_tiny_9  0x55a9f8b9538981a1   // scale 60
-#define DELTA_P0_tiny 0x1.ba4d0b79d16e6p-2   // scale 66
+// P_coefficients in asending order, in varying scales. p0_delta is in floating
+// point
+#define P_tiny_0 -0x8593442e139d           // scale 66
+#define P_tiny_1 -0x1fcf7055ac5f03         // scale 64
+#define P_tiny_2 -0x106dde33d8dc179        // scale 61
+#define P_tiny_3 -0xc31d6e09935118a        // scale 60
+#define P_tiny_4 -0x3560de73cb5bbcc0       // scale 59
+#define P_tiny_5 -0x1eb4c7e14b254de8       // scale 57
+#define P_tiny_6 0x1fdf5a9d23430bd7        // scale 56
+#define P_tiny_7 0x62c4020631de121b        // scale 56
+#define P_tiny_8 0x4be1ed5d031773f1        // scale 58
+#define P_tiny_9 0x55a9f8b9538981a1        // scale 60
+#define DELTA_P0_tiny 0x1.ba4d0b79d16e6p-2 // scale 66
 
-// Q_coefficients in asending order, in varying scales. q0_delta is in floating point 
-#define Q_tiny_0  -0x85933cda2d6d      // sacle 66
-#define Q_tiny_1  -0x1fcf792da7d51d    // sacle 64
-#define Q_tiny_2  -0x106ec6e0ed13ae1   // sacle 61
-#define Q_tiny_3  -0x61b925a39a461aa   // sacle 59
-#define Q_tiny_4  -0x35ebf9dc72fab062  // sacle 59
-#define Q_tiny_5  -0x2131cf7760e82873  // sacle 57
-#define Q_tiny_6  0x1860ae67db2a6609   // sacle 56
-#define Q_tiny_7  0x5e9a123701d89289   // sacle 56
-#define Q_tiny_8  0x417b35aab14ac49d   // sacle 56
-#define Q_tiny_9  0x5e4a26a7c1415755   // sacle 57
+// Q_coefficients in asending order, in varying scales. q0_delta is in floating
+// point
+#define Q_tiny_0 -0x85933cda2d6d           // sacle 66
+#define Q_tiny_1 -0x1fcf792da7d51d         // sacle 64
+#define Q_tiny_2 -0x106ec6e0ed13ae1        // sacle 61
+#define Q_tiny_3 -0x61b925a39a461aa        // sacle 59
+#define Q_tiny_4 -0x35ebf9dc72fab062       // sacle 59
+#define Q_tiny_5 -0x2131cf7760e82873       // sacle 57
+#define Q_tiny_6 0x1860ae67db2a6609        // sacle 56
+#define Q_tiny_7 0x5e9a123701d89289        // sacle 56
+#define Q_tiny_8 0x417b35aab14ac49d        // sacle 56
+#define Q_tiny_9 0x5e4a26a7c1415755        // sacle 57
 #define DELTA_Q0_tiny 0x1.8a7adad44d65ap-4 // scale 66
 
 // erfinv(+-1) = +-Inf with divide by zero
@@ -93,123 +95,119 @@
   } while (0)
 
 // Compute -log(2^(-n_adjust) * x), where x < 1
-#define NEG_LOGX_4_TRANSFORM(vx, n_adjust, y_hi, y_lo, vlen) \
-  do { \
-    /* work on entire vector register */ \
-    VFLOAT vx_in = (vx); \
-    VINT n = __riscv_vadd(__riscv_vsra(F_AS_I(vx_in), MAN_LEN - 8, (vlen)), 0x96, \
-                     vlen); \
-    n = __riscv_vsub(__riscv_vsra(n, 8, vlen), EXP_BIAS, vlen); \
-    VFLOAT scale = I_AS_F(__riscv_vsll(__riscv_vrsub(n, EXP_BIAS, (vlen)), MAN_LEN, (vlen))); \
-    vx_in = __riscv_vfmul(vx_in, scale, (vlen)); \
-    /* x is scaled, and -log(x) is 2 atanh(w/2); w = 2(1-x)/(1+x) */ \
-    n = __riscv_vsub(n, (n_adjust), (vlen)); \
-    VFLOAT n_flt = __riscv_vfcvt_f(n, (vlen)); \
-    VFLOAT numer = __riscv_vfrsub(vx_in, fp_posOne, (vlen)); \
-    /* note that 1-x is exact as 1/2 < x < 2 */ \
-    numer = __riscv_vfadd(numer, numer, (vlen)); \
-    VFLOAT denom = __riscv_vfadd(vx_in, fp_posOne, (vlen)); \
-    VFLOAT delta_denom = __riscv_vfadd(__riscv_vfrsub(denom, fp_posOne, (vlen)), vx_in, (vlen)); \
-    /* note that 1 - denom is exact even if denom > 2 */ \
-    /* becase 1 has many trailing zeros */ \
-    VFLOAT r_hi, r_lo, r; \
-    DIV_N1D2(numer, denom, delta_denom, r_hi, r_lo, (vlen)); \
-    r = __riscv_vfadd(r_hi, r_lo, (vlen)); \
-    /* for the original unscaled x, we have */ \
-    /* -log(x) = -n * log(2) + 2 atanh(-w/2) */ \
-    /* where w = 2(1-x)/(1+x); -w = 2(x-1)/(x+1) */ \
-    VFLOAT A, B; \
-    A = __riscv_vfmadd(n_flt, NEG_LOG2_HI, r_hi, (vlen)); \
-    B = __riscv_vfmsub(n_flt, NEG_LOG2_HI, A, (vlen)); \
-    B = __riscv_vfadd(B, r_hi, (vlen)); \
-    B = __riscv_vfadd(r_lo, B, (vlen)); \
-    VFLOAT rsq = __riscv_vfmul(r, r, (vlen)); \
-    VFLOAT rcube = __riscv_vfmul(rsq, r, (vlen)); \
-    VFLOAT r6 = __riscv_vfmul(rcube, rcube, (vlen)); \
-    VFLOAT poly_right = PSTEP(0x1.74681ff881228p-14, rsq, \
-                            PSTEP( 0x1.39751be23e4a3p-16, \
-                                   0x1.30a893993e73dp-18, rsq, vlen), \
-                        vlen); \
-    VFLOAT poly_left = PSTEP(0x1.999999996ce82p-7, rsq, \
-                            PSTEP(0x1.249249501b1adp-9, \
-                                  0x1.c71c47e7189f6p-12, rsq, vlen), \
-                       vlen); \
-    poly_left = __riscv_vfmacc(poly_left, r6, poly_right, (vlen)); \
-    poly_left = PSTEP(0x1.55555555555dbp-4, rsq, poly_left, (vlen)); \
-    B = __riscv_vfmacc(B, NEG_LOG2_LO, n_flt, (vlen)); \
-    B = __riscv_vfmacc(B, rcube, poly_left, (vlen)); \
-    FAST2SUM(A, B, (y_hi), (y_lo), (vlen)); \
-    /* A + B is -log(x) with extra precision, |B| \le ulp(A)/2 */ \
-  } while(0)
+#define NEG_LOGX_4_TRANSFORM(vx, n_adjust, y_hi, y_lo, vlen)                   \
+  do {                                                                         \
+    /* work on entire vector register */                                       \
+    VFLOAT vx_in = (vx);                                                       \
+    VINT n = __riscv_vadd(__riscv_vsra(F_AS_I(vx_in), MAN_LEN - 8, (vlen)),    \
+                          0x96, vlen);                                         \
+    n = __riscv_vsub(__riscv_vsra(n, 8, vlen), EXP_BIAS, vlen);                \
+    VFLOAT scale = I_AS_F(                                                     \
+        __riscv_vsll(__riscv_vrsub(n, EXP_BIAS, (vlen)), MAN_LEN, (vlen)));    \
+    vx_in = __riscv_vfmul(vx_in, scale, (vlen));                               \
+    /* x is scaled, and -log(x) is 2 atanh(w/2); w = 2(1-x)/(1+x) */           \
+    n = __riscv_vsub(n, (n_adjust), (vlen));                                   \
+    VFLOAT n_flt = __riscv_vfcvt_f(n, (vlen));                                 \
+    VFLOAT numer = __riscv_vfrsub(vx_in, fp_posOne, (vlen));                   \
+    /* note that 1-x is exact as 1/2 < x < 2 */                                \
+    numer = __riscv_vfadd(numer, numer, (vlen));                               \
+    VFLOAT denom = __riscv_vfadd(vx_in, fp_posOne, (vlen));                    \
+    VFLOAT delta_denom = __riscv_vfadd(                                        \
+        __riscv_vfrsub(denom, fp_posOne, (vlen)), vx_in, (vlen));              \
+    /* note that 1 - denom is exact even if denom > 2 */                       \
+    /* becase 1 has many trailing zeros */                                     \
+    VFLOAT r_hi, r_lo, r;                                                      \
+    DIV_N1D2(numer, denom, delta_denom, r_hi, r_lo, (vlen));                   \
+    r = __riscv_vfadd(r_hi, r_lo, (vlen));                                     \
+    /* for the original unscaled x, we have */                                 \
+    /* -log(x) = -n * log(2) + 2 atanh(-w/2) */                                \
+    /* where w = 2(1-x)/(1+x); -w = 2(x-1)/(x+1) */                            \
+    VFLOAT A, B;                                                               \
+    A = __riscv_vfmadd(n_flt, NEG_LOG2_HI, r_hi, (vlen));                      \
+    B = __riscv_vfmsub(n_flt, NEG_LOG2_HI, A, (vlen));                         \
+    B = __riscv_vfadd(B, r_hi, (vlen));                                        \
+    B = __riscv_vfadd(r_lo, B, (vlen));                                        \
+    VFLOAT rsq = __riscv_vfmul(r, r, (vlen));                                  \
+    VFLOAT rcube = __riscv_vfmul(rsq, r, (vlen));                              \
+    VFLOAT r6 = __riscv_vfmul(rcube, rcube, (vlen));                           \
+    VFLOAT poly_right = PSTEP(                                                 \
+        0x1.74681ff881228p-14, rsq,                                            \
+        PSTEP(0x1.39751be23e4a3p-16, 0x1.30a893993e73dp-18, rsq, vlen), vlen); \
+    VFLOAT poly_left = PSTEP(                                                  \
+        0x1.999999996ce82p-7, rsq,                                             \
+        PSTEP(0x1.249249501b1adp-9, 0x1.c71c47e7189f6p-12, rsq, vlen), vlen);  \
+    poly_left = __riscv_vfmacc(poly_left, r6, poly_right, (vlen));             \
+    poly_left = PSTEP(0x1.55555555555dbp-4, rsq, poly_left, (vlen));           \
+    B = __riscv_vfmacc(B, NEG_LOG2_LO, n_flt, (vlen));                         \
+    B = __riscv_vfmacc(B, rcube, poly_left, (vlen));                           \
+    FAST2SUM(A, B, (y_hi), (y_lo), (vlen));                                    \
+    /* A + B is -log(x) with extra precision, |B| \le ulp(A)/2 */              \
+  } while (0)
 
-// This macro computes w_hi + w_lo = sqrt(y_hi + y_lo) in floating point 
-// and 1/(w_hi + w_lo) as a Q63 fixed-point T 
-// y_hi, y_lo is normalized on input; that is y_hi has 
+// This macro computes w_hi + w_lo = sqrt(y_hi + y_lo) in floating point
+// and 1/(w_hi + w_lo) as a Q63 fixed-point T
+// y_hi, y_lo is normalized on input; that is y_hi has
 // full working precision of the sum y_hi + y_lo
 // and 2 log(2) < y_hi < 1100 log(2)
-#define SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi, w_lo, T, t_sc, t_sc_inv, vlen) \
-  do { \
-    (w_hi) = __riscv_vfsqrt((y_hi), (vlen)); \
-    (w_lo) = __riscv_vfnmsub((w_hi), (w_hi), (y_hi), (vlen)); \
-    (w_lo) = __riscv_vfadd((w_lo), (y_lo), (vlen)); \
-    VFLOAT recip; \
-    recip = __riscv_vfadd((y_hi), (y_hi), (vlen)); \
-    recip = __riscv_vfrec7(recip, (vlen)); \
-    recip = __riscv_vfmul(recip, (w_hi), (vlen)); \
-    (w_lo) = __riscv_vfmul((w_lo), recip, (vlen)); \
-    /* w_hi + w_lo is sqrt(y_hi + y_lo) to extra precision */ \
-    /* now compute T = t_sc/(w_hi + w_lo) as fixed point */ \
-    VFLOAT t_lo = VFMV_VF((t_sc), (vlen)); \
-    VFLOAT t_hi = __riscv_vfdiv(t_lo, (w_hi), (vlen)); \
-    (T) = __riscv_vfcvt_x(t_hi, (vlen)); \
-    t_lo = __riscv_vfnmsac(t_lo, (w_hi), t_hi, (vlen)); \
-    t_lo = __riscv_vfnmsac(t_lo, (w_lo), t_hi, (vlen)); \
-    t_lo = __riscv_vfmul(t_lo, t_hi, (vlen)); \
-    t_lo = __riscv_vfmul(t_lo, (t_sc_inv), vlen); \
-    (T) = __riscv_vadd((T), __riscv_vfcvt_x(t_lo, (vlen)), (vlen)); \
-  } while(0)
-
-#define ERFCINV_PQ_TINY(T, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen) \
-  do { \
-    /* T is in scale of 65 */ \
-    VINT P, Q; \
-    P = PSTEP_I_SRA(P_tiny_7, T, 4, PSTEP_I_SRA(P_tiny_8, P_tiny_9, 4, T, \
-        (vlen)), (vlen)); \
-    /* P in Q_56 */ \
-    P = PSTEP_I_SRA(P_tiny_5, T, 1, PSTEP_I_SRA(P_tiny_6, P, 2, T, \
-        (vlen)), (vlen)); \
-    /* P in Q_57 */ \
-    P = PSTEP_I_SRA(P_tiny_3, T, 1, PSTEP_I(P_tiny_4, P, T, \
-        (vlen)), (vlen)); \
-    /* P in Q_60 */ \
-    P = PSTEP_I_SLL(P_tiny_1, T, 1, PSTEP_I_SRA(P_tiny_2, P, 1, T, \
-        (vlen)), (vlen)); \
-    /* P in Q_64 */ \
-    P = PSTEP_I(P_tiny_0, T, P, (vlen));\
-    /* P in Q_66 */ \
-    \
-    Q = PSTEP_I_SRA(Q_tiny_7, T, 2, PSTEP_I_SRA(Q_tiny_8, Q_tiny_9, 3, T, \
-        (vlen)), (vlen)); \
-    /* Q in Q_56 */ \
-    Q = PSTEP_I_SRA(Q_tiny_5, T, 1, PSTEP_I_SRA(Q_tiny_6, Q, 2, T, \
-        (vlen)), (vlen)); \
-    /* Q in Q_57 */ \
-    Q = PSTEP_I_SRA(Q_tiny_3, T, 2, PSTEP_I(Q_tiny_4, Q, T, \
-        (vlen)), (vlen)); \
-    /* P in Q_59 */ \
-    Q = PSTEP_I_SLL(Q_tiny_1, T, 1, PSTEP_I(Q_tiny_2, Q, T, \
-        (vlen)), (vlen)); \
-    /* Q in Q_64 */ \
-    Q = PSTEP_I(Q_tiny_0, T, Q, (vlen));\
-    /* Q in Q_66 */ \
-    \
-    p_hi_tiny = __riscv_vfcvt_f(P, (vlen)); \
-    p_lo_tiny = __riscv_vfcvt_f( \
-                __riscv_vsub(P, __riscv_vfcvt_x(p_hi_tiny, (vlen)), (vlen)), (vlen)); \
-    p_lo_tiny = __riscv_vfadd(p_lo_tiny, DELTA_P0_tiny, (vlen)); \
-    q_hi_tiny = __riscv_vfcvt_f(Q, vlen); \
-    q_lo_tiny = __riscv_vfcvt_f( \
-                __riscv_vsub(Q, __riscv_vfcvt_x(q_hi_tiny, (vlen)), (vlen)), (vlen)); \
-    q_lo_tiny = __riscv_vfadd(q_lo, DELTA_Q0_tiny, (vlen)); \
-  } while(0)
+#define SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi, w_lo, T, t_sc, t_sc_inv, vlen)     \
+  do {                                                                         \
+    (w_hi) = __riscv_vfsqrt((y_hi), (vlen));                                   \
+    (w_lo) = __riscv_vfnmsub((w_hi), (w_hi), (y_hi), (vlen));                  \
+    (w_lo) = __riscv_vfadd((w_lo), (y_lo), (vlen));                            \
+    VFLOAT recip;                                                              \
+    recip = __riscv_vfadd((y_hi), (y_hi), (vlen));                             \
+    recip = __riscv_vfrec7(recip, (vlen));                                     \
+    recip = __riscv_vfmul(recip, (w_hi), (vlen));                              \
+    (w_lo) = __riscv_vfmul((w_lo), recip, (vlen));                             \
+    /* w_hi + w_lo is sqrt(y_hi + y_lo) to extra precision */                  \
+    /* now compute T = t_sc/(w_hi + w_lo) as fixed point */                    \
+    VFLOAT t_lo = VFMV_VF((t_sc), (vlen));                                     \
+    VFLOAT t_hi = __riscv_vfdiv(t_lo, (w_hi), (vlen));                         \
+    (T) = __riscv_vfcvt_x(t_hi, (vlen));                                       \
+    t_lo = __riscv_vfnmsac(t_lo, (w_hi), t_hi, (vlen));                        \
+    t_lo = __riscv_vfnmsac(t_lo, (w_lo), t_hi, (vlen));                        \
+    t_lo = __riscv_vfmul(t_lo, t_hi, (vlen));                                  \
+    t_lo = __riscv_vfmul(t_lo, (t_sc_inv), vlen);                              \
+    (T) = __riscv_vadd((T), __riscv_vfcvt_x(t_lo, (vlen)), (vlen));            \
+  } while (0)
 
+#define ERFCINV_PQ_TINY(T, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen)   \
+  do {                                                                         \
+    /* T is in scale of 65 */                                                  \
+    VINT P, Q;                                                                 \
+    P = PSTEP_I_SRA(P_tiny_7, T, 4,                                            \
+                    PSTEP_I_SRA(P_tiny_8, P_tiny_9, 4, T, (vlen)), (vlen));    \
+    /* P in Q_56 */                                                            \
+    P = PSTEP_I_SRA(P_tiny_5, T, 1, PSTEP_I_SRA(P_tiny_6, P, 2, T, (vlen)),    \
+                    (vlen));                                                   \
+    /* P in Q_57 */                                                            \
+    P = PSTEP_I_SRA(P_tiny_3, T, 1, PSTEP_I(P_tiny_4, P, T, (vlen)), (vlen));  \
+    /* P in Q_60 */                                                            \
+    P = PSTEP_I_SLL(P_tiny_1, T, 1, PSTEP_I_SRA(P_tiny_2, P, 1, T, (vlen)),    \
+                    (vlen));                                                   \
+    /* P in Q_64 */                                                            \
+    P = PSTEP_I(P_tiny_0, T, P, (vlen));                                       \
+    /* P in Q_66 */                                                            \
+                                                                               \
+    Q = PSTEP_I_SRA(Q_tiny_7, T, 2,                                            \
+                    PSTEP_I_SRA(Q_tiny_8, Q_tiny_9, 3, T, (vlen)), (vlen));    \
+    /* Q in Q_56 */                                                            \
+    Q = PSTEP_I_SRA(Q_tiny_5, T, 1, PSTEP_I_SRA(Q_tiny_6, Q, 2, T, (vlen)),    \
+                    (vlen));                                                   \
+    /* Q in Q_57 */                                                            \
+    Q = PSTEP_I_SRA(Q_tiny_3, T, 2, PSTEP_I(Q_tiny_4, Q, T, (vlen)), (vlen));  \
+    /* P in Q_59 */                                                            \
+    Q = PSTEP_I_SLL(Q_tiny_1, T, 1, PSTEP_I(Q_tiny_2, Q, T, (vlen)), (vlen));  \
+    /* Q in Q_64 */                                                            \
+    Q = PSTEP_I(Q_tiny_0, T, Q, (vlen));                                       \
+    /* Q in Q_66 */                                                            \
+                                                                               \
+    p_hi_tiny = __riscv_vfcvt_f(P, (vlen));                                    \
+    p_lo_tiny = __riscv_vfcvt_f(                                               \
+        __riscv_vsub(P, __riscv_vfcvt_x(p_hi_tiny, (vlen)), (vlen)), (vlen));  \
+    p_lo_tiny = __riscv_vfadd(p_lo_tiny, DELTA_P0_tiny, (vlen));               \
+    q_hi_tiny = __riscv_vfcvt_f(Q, vlen);                                      \
+    q_lo_tiny = __riscv_vfcvt_f(                                               \
+        __riscv_vsub(Q, __riscv_vfcvt_x(q_hi_tiny, (vlen)), (vlen)), (vlen));  \
+    q_lo_tiny = __riscv_vfadd(q_lo, DELTA_Q0_tiny, (vlen));                    \
+  } while (0)
diff --git a/test/src/test_erfcinv.cpp b/test/src/test_erfcinv.cpp
index dc7f416..e937fcf 100644
--- a/test/src/test_erfcinv.cpp
+++ b/test/src/test_erfcinv.cpp
@@ -23,22 +23,28 @@ TEST(erfcinv, tiny_args) {
 
   COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")
 
-  x_start = 0x1.0p-1074;;
+  x_start = 0x1.0p-1074;
+  ;
   x_end = 0x1.0p-1000;
   nb_tests = 40000;
   double thres = 3.5;
-  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests, thres);
+  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
+                        nb_tests, thres);
 
-  x_start = 0x1.0p-1000;;
+  x_start = 0x1.0p-1000;
+  ;
   x_end = 0x1.0p-500;
   nb_tests = 40000;
   thres = 2.5;
-  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests, thres);
+  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
+                        nb_tests, thres);
 
-  x_start = 0x1.0p-500;;
+  x_start = 0x1.0p-500;
+  ;
   x_end = 0x1.0p-52;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests, thres);
+  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
+                        nb_tests, thres);
 }
 
 TEST(erfcinv, small_args) {
@@ -47,15 +53,19 @@ TEST(erfcinv, small_args) {
 
   COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")
 
-  x_start = 0x1.0p-50;;
+  x_start = 0x1.0p-50;
+  ;
   x_end = 0x1.0p-20;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
+                        nb_tests);
 
-  x_start = 0x1.0p-20;;
+  x_start = 0x1.0p-20;
+  ;
   x_end = 0x1.0p-4;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
+                        nb_tests);
 }
 
 TEST(erfcinv, medium_args) {
@@ -64,19 +74,23 @@ TEST(erfcinv, medium_args) {
 
   COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")
 
-  x_start = 0x1.0p-4;;
+  x_start = 0x1.0p-4;
+  ;
   x_end = 0x1.0p-1;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
+                        nb_tests);
 
-  x_start = 0x1.0p-1;;
+  x_start = 0x1.0p-1;
+  ;
   x_end = 0x1.8p0;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
+                        nb_tests);
 
   x_start = 2.0 - 0x1.0p-52;
   x_end = 0x1.8p0;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
+                        nb_tests);
 }
-
diff --git a/test/src/test_erfinv.cpp b/test/src/test_erfinv.cpp
index 1744e02..9e7a436 100644
--- a/test/src/test_erfinv.cpp
+++ b/test/src/test_erfinv.cpp
@@ -23,21 +23,26 @@ TEST(erfinv, small_args) {
 
   COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")
 
-  x_start = 0x1.0p-40;;
+  x_start = 0x1.0p-40;
+  ;
   x_end = 0x1.0p-30;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 
-  x_start = 0x1.0p-30;;
+  x_start = 0x1.0p-30;
+  ;
   x_end = 0x1.0p-10;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 
-  x_start = 0x1.0p-10;;
+  x_start = 0x1.0p-10;
+  ;
   x_end = 0x1.0p-4;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
-
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 }
 
 TEST(erfinv, middle_args) {
@@ -46,40 +51,52 @@ TEST(erfinv, middle_args) {
 
   COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")
 
-  x_start = 0x1.0p-4;;
+  x_start = 0x1.0p-4;
+  ;
   x_end = 0x1.0p-2;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 
-  x_start = 0x1.0p-2;;
+  x_start = 0x1.0p-2;
+  ;
   x_end = 0x1.0p-1;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 
-  x_start = 0x1.0p-1;;
+  x_start = 0x1.0p-1;
+  ;
   x_end = 0x1.78p-1;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 
-  x_start = 0x1.6p-1;;
+  x_start = 0x1.6p-1;
+  ;
   x_end = 0x1.7cp-1;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 
-  x_start = 0x1.71p-1;;
+  x_start = 0x1.71p-1;
+  ;
   x_end = 0x1.8p-1;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 
   x_start = -0.8;
   x_end = 0.8;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 
   x_start = 0.25;
   x_end = 0.9;
   nb_tests = 2;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 }
 
 TEST(erfinv, close_to_1) {
@@ -91,6 +108,6 @@ TEST(erfinv, close_to_1) {
   x_start = 1.0 - 0x1.0p-53;
   x_end = 0x1.ffp-1;
   nb_tests = 40000;
-  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end, nb_tests);
+  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
+                        nb_tests);
 }
-