diff --git a/CMakeLists.txt b/CMakeLists.txt
index 233f8ef..846675a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,10 @@ set(PROJECT_SOURCES
   src/rvvlm_erfDI.c
   src/rvvlm_erfcD.c
   src/rvvlm_erfcDI.c
+  src/rvvlm_erfcinvD.c
+  src/rvvlm_erfcinvDI.c
+  src/rvvlm_erfinvD.c
+  src/rvvlm_erfinvDI.c
   src/rvvlm_expD_tbl.c
   src/rvvlm_expD.c
   src/rvvlm_expDI.c
diff --git a/include/rvvlm.h b/include/rvvlm.h
index 569af80..a5ac3a0 100644
--- a/include/rvvlm.h
+++ b/include/rvvlm.h
@@ -83,9 +83,25 @@ union sui64_fp64 {
 #define PSTEP_I(COEFF_j, X, POLY, vlen) \
   __riscv_vsadd(__riscv_vsmul((POLY), (X), 1, (vlen)), (COEFF_j), (vlen))

+#define PSTEP_I_SLL(COEFF_j, X, K, POLY, vlen) \
+  __riscv_vsadd( \
+      __riscv_vsll(__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
+      (COEFF_j), (vlen))
+
+#define PSTEP_I_SRA(COEFF_j, X, K, POLY, vlen) \
+  __riscv_vsadd( \
+      __riscv_vsra(__riscv_vsmul((POLY), (X), 1, (vlen)), (K), (vlen)), \
+      (COEFF_j), (vlen))
+
 #define PSTEPN_I(COEFF_j, X, POLY, vlen) \
   __riscv_vrsub(__riscv_vsmul((POLY), (X), 1, (vlen)), (COEFF_j), (vlen))

+#define PSTEP_I_ab(pick_a, COEFF_a, COEFF_b, X, POLY, vlen) \
+  __riscv_vsadd( \
+      __riscv_vsmul((POLY), (X), 1, (vlen)), \
+      __riscv_vmerge(VMVI_VX((COEFF_b), (vlen)), (COEFF_a), (pick_a), (vlen)), \
+      (vlen))
+
 #define FAST2SUM(X, Y, S, s, vlen) \
   do { \
     (S) = __riscv_vfadd((X), (Y), (vlen)); \
@@ -285,6 +301,20 @@ union sui64_fp64 {
 #define RVVLM_ERFCDI_VSET_CONFIG "rvvlm_fp64m1.h"
 #define RVVLM_ERFCDI_STD rvvlm_erfcI

+// FP64 erfinv function configuration
+#define RVVLM_ERFINVD_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_ERFINVD_STD rvvlm_erfinv
+
+#define RVVLM_ERFINVDI_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_ERFINVDI_STD rvvlm_erfinvI
+
+// FP64 erfcinv function configuration
+#define RVVLM_ERFCINVD_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_ERFCINVD_STD rvvlm_erfcinv
+
+#define RVVLM_ERFCINVDI_VSET_CONFIG "rvvlm_fp64m1.h"
+#define RVVLM_ERFCINVDI_STD rvvlm_erfcinvI
+
 // FP64 exp function configuration
 #define RVVLM_EXPD_VSET_CONFIG "rvvlm_fp64m4.h"
 #define RVVLM_EXPD_STD rvvlm_expD_std
@@ -518,6 +548,14 @@ void RVVLM_ERFCD_STD(size_t x_len, const double *x, double *y);
 void RVVLM_ERFCDI_STD(size_t x_len, const double *x, size_t stride_x,
                       double *y, size_t stride_y);

+void RVVLM_ERFCINVD_STD(size_t x_len, const double *x, double *y);
+void RVVLM_ERFCINVDI_STD(size_t x_len, const double *x, size_t stride_x,
+                         double *y, size_t stride_y);
+
+void RVVLM_ERFINVD_STD(size_t x_len, const double *x, double *y);
+void RVVLM_ERFINVDI_STD(size_t x_len, const double *x, size_t stride_x,
+                        double *y, size_t stride_y);
+
 void RVVLM_EXPD_STD(size_t x_len, const double *x, double *y);
 void RVVLM_EXPDI_STD(size_t x_len, const double *x, size_t stride_x, double *y,
                      size_t stride_y);
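Note on the three macros added above: `__riscv_vsmul(..., 1, vlen)` is RVV's signed saturating fractional multiply, i.e. a Q63-by-Q63 product keeping the rounded high 64 bits, so each `PSTEP_I*` is one fixed-point Horner step. The `_SLL`/`_SRA` variants shift that product before adding the next coefficient, so coefficients stored in different Q-scales can be combined, and `PSTEP_I_ab` picks one of two coefficient tables per lane under a mask. A minimal scalar sketch of the `_SRA` step (hypothetical helper names; saturation of the `INT64_MIN * INT64_MIN` corner case omitted):

```c
#include <stdint.h>

// Scalar model of __riscv_vsmul with round-to-nearest-up vxrm:
// keep the high 64 bits of the exact 128-bit product, i.e. a Q63 multiply.
static int64_t q63_mul(int64_t a, int64_t b) {
  __int128 prod = (__int128)a * (__int128)b;             // exact product
  return (int64_t)((prod + ((__int128)1 << 62)) >> 63);  // rounded high part
}

// PSTEP_I:     poly' = coeff_j + q63_mul(poly, x)
// PSTEP_I_SRA: poly' = coeff_j + (q63_mul(poly, x) >> k); the shift aligns
// the running sum to a coefficient kept in a coarser Q-scale.
static int64_t pstep_i_sra(int64_t coeff_j, int64_t x, int k, int64_t poly) {
  return coeff_j + (q63_mul(poly, x) >> k);
}
```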
diff --git a/include/rvvlm_erfcinvD.inc.h b/include/rvvlm_erfcinvD.inc.h
new file mode 100644
index 0000000..15816ee
--- /dev/null
+++ b/include/rvvlm_erfcinvD.inc.h
@@ -0,0 +1,211 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_ERFCINVD_STD
#else
#define F_VER1 RVVLM_ERFCINVDI_STD
#endif

// Erfcinv is defined on (0, 2); it suffices to consider (0, 1].
// Two regions of approximation: left is (0, 0x1.2p-2] and right is (0x1.2p-2, 1).
// Both are done with rational functions.
// For right, t*P(t)/Q(t), t = 1-x; x in (0x1.2p-2, 1)
// For left, y*P(t)/Q(t), y = sqrt(-log(x)); and t = 1/y
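The tables that follow are 64-bit fixed-point coefficients: a Q79 entry c represents c * 2^-79, and the `DELTA_*0` values carry the part of the leading coefficient that does not fit in 64 bits (the code folds them into `p_lo`/`q_lo` after converting the fixed-point sum to floating point, still in the integer-scaled domain; the common 2^-79 or 2^-67 factor cancels in the ratio P/Q). A small decoding check, assuming that reading of the delta's scale:

```c
#include <stdio.h>

// Decode P_right_0: real value ~ (P_right_0 + DELTA_P0_right) * 2^-79.
int main(void) {
  long long p0 = -0x48dbe9f5b3eabaaLL;  // Q79 leading coefficient
  double delta = -0x1.ec7dc41c17860p-2; // residual, same Q79 domain (assumed)
  printf("P_right_0 ~ %.20e\n", ((double)p0 + delta) * 0x1.0p-79);
  return 0;
}
```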
// P_coefficients in ascending order, all in Q79. p0_delta is in floating point
#define P_right_0 -0x48dbe9f5b3eabaa
#define P_right_1 -0xb35279f1a626ae5
#define P_right_2 0x33789911873d184a
#define P_right_3 -0x1bf9138fc77c0fbf
#define P_right_4 -0x2d4ec43bc48403d4
#define P_right_5 0x2d61deb53842cca1
#define P_right_6 0x23324eca6b3ff02
#define P_right_7 -0xd4ec1d31542c4fc
#define P_right_8 0x2ecf3c60308b0f2
#define P_right_9 0x7c917b3378071e
#define P_right_10 -0x1e09b597f226ca
#define DELTA_P0_right -0x1.ec7dc41c17860p-2

// Q_coefficients in ascending order, all in Q79. q0_delta is in floating point
#define Q_right_0 -0x52366e5b14c0970
#define Q_right_1 -0xca57e95abcc599b
#define Q_right_2 0x3b6c91ec67f5759c
#define Q_right_3 -0x1c40d5daa3be22bc
#define Q_right_4 -0x41f11eb5d837386c
#define Q_right_5 0x3c6ce478fcd75c9a
#define Q_right_6 0xbb1cd7270cfba1d
#define Q_right_7 -0x1988a4116498f1af
#define Q_right_8 0x44dc3042f103d20
#define Q_right_9 0x2390e683d02edf3
#define Q_right_10 -0x8ec66f2a7e410c
#define DELTA_Q0_right -0x1.29a0161e99446p-3

// P_coefficients in ascending order, all in Q67. p0_delta is in floating point
#define P_left_0 0x17a0bb69321df
#define P_left_1 0x402eb416ae6015
#define P_left_2 0x2973eb18028ce34
#define P_left_3 0x8034a7ece1d5370
#define P_left_4 0xa76c08a74dae273
#define P_left_5 0x11dd3876b83dd078
#define P_left_6 0xfdd7693c3b77653
#define P_left_7 0xb33d66152b3c223
#define P_left_8 0x5a564c28c6a41a9
#define P_left_9 0x1190449fe630213
#define P_left_10 -0x659c784274e1
#define DELTA_P0_left -0x1.d622f4cbe0eeep-2

// Q_coefficients in ascending order, all in Q67. q0_delta is in floating point
#define Q_left_0 0x17a09aabf9cee
#define Q_left_1 0x4030b9059ffcad
#define Q_left_2 0x29b26b0d87f7855
#define Q_left_3 0x87572a13d3fa2dd
#define Q_left_4 0xd7a728b5620ac3c
#define Q_left_5 0x1754392b473fd439
#define Q_left_6 0x1791b9a091a816c2
#define Q_left_7 0x167f71db9e13b075
#define Q_left_8 0xcb9f5f3e5e618a4
#define Q_left_9 0x68271fae767c68e
#define Q_left_10 0x13745c4fa224b25
#define DELTA_Q0_left 0x1.f7e7557a34ae6p-2

void F_VER1(API) {
  size_t vlen;
  VFLOAT vx, vx_sign, vy, vy_special;
  VBOOL special_args;

  SET_ROUNDTONEAREST;
  // stripmining over input arguments
  for (; _inarg_n > 0; _inarg_n -= vlen) {
    vlen = VSET(_inarg_n);
    vx = VFLOAD_INARG1(vlen);

    // Handle Inf and NaN
    EXCEPTION_HANDLING_ERFCINV(vx, special_args, vy_special, vlen);

    vx_sign = __riscv_vfrsub(vx, fp_posOne, vlen);
    VFLOAT two_minus_x = __riscv_vfadd(vx_sign, fp_posOne, vlen);
    VBOOL x_gt_1 = __riscv_vmflt(vx_sign, fp_posZero, vlen);
    vx = __riscv_vmerge(vx, two_minus_x, x_gt_1, vlen);
    // vx is now in (0, 1]
    VBOOL x_in_left = __riscv_vmfle(vx, 0x1.2p-2, vlen);

    VFLOAT w_hi, w_lo, w_hi_left, w_lo_left, y_hi, y_lo;
    VINT T, T_left, T_tiny;
    VBOOL x_is_tiny;
    // clear x_is_tiny: it is only assigned inside the left branch below,
    // but is tested unconditionally afterwards
    x_is_tiny = __riscv_vmxor(x_is_tiny, x_is_tiny, vlen);

    if (__riscv_vcpop(x_in_left, vlen) > 0) {
      VFLOAT x_left = VFMV_VF(0x1.0p-3, vlen);
      x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen);
      x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-52, vlen);
      INT n_adjust = 60;
      x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen);
      NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen);

      SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left,
                        0x1.0p63, 0x1.0p-63, vlen);
      if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
        VFLOAT w_hi_dummy, w_lo_dummy;
        SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny,
                          0x1.0p65, 0x1.0p-65, vlen);
      }
    }
    w_hi = VFMV_VF(fp_posOne, vlen);
    w_hi = __riscv_vfsub(w_hi, vx, vlen);
    w_lo = __riscv_vfrsub(w_hi, fp_posOne, vlen);
    w_lo = __riscv_vfsub(w_lo, vx, vlen);
    T = __riscv_vfcvt_x(__riscv_vfmul(w_hi, 0x1.0p63, vlen), vlen);
    VFLOAT delta_t = __riscv_vfmul(w_lo, 0x1.0p63, vlen);
    T = __riscv_vadd(T, __riscv_vfcvt_x(delta_t, vlen), vlen);
    T = __riscv_vmerge(T, T_left, x_in_left, vlen);

    w_hi = __riscv_vmerge(w_hi, w_hi_left, x_in_left, vlen);
    w_lo = __riscv_vmerge(w_lo, w_lo_left, x_in_left, vlen);

    // For the transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
    VINT P, Q;

    P = __riscv_vmerge(VMVI_VX(P_right_10, vlen), P_left_10, x_in_left, vlen);
    P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T,
        PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T,
        PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T,
        PSTEP_I_ab(x_in_left, P_left_9, P_right_9, T, P,
        vlen), vlen), vlen), vlen);

    Q = __riscv_vmerge(VMVI_VX(Q_right_10, vlen), Q_left_10, x_in_left, vlen);
    Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T,
        PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T,
        PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T,
        PSTEP_I_ab(x_in_left, Q_left_9, Q_right_9, T, Q,
        vlen), vlen), vlen), vlen);

    P = PSTEP_I_ab(x_in_left, P_left_0, P_right_0, T,
        PSTEP_I_ab(x_in_left, P_left_1, P_right_1, T,
        PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T,
        PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T,
        PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T,
        PSTEP_I_ab(x_in_left, P_left_5, P_right_5, T, P,
        vlen), vlen),
        vlen), vlen), vlen), vlen);

    Q = PSTEP_I_ab(x_in_left, Q_left_0, Q_right_0, T,
        PSTEP_I_ab(x_in_left, Q_left_1, Q_right_1, T,
        PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T,
        PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T,
        PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T,
        PSTEP_I_ab(x_in_left, Q_left_5, Q_right_5, T, Q,
        vlen), vlen), vlen), vlen), vlen), vlen);

    VFLOAT p_hi, p_lo;
    p_hi = __riscv_vfcvt_f(P, vlen);
    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen);
    VFLOAT delta_p0 = VFMV_VF(DELTA_P0_right, vlen);
    delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_left, x_in_left, vlen);
    p_lo = __riscv_vfadd(p_lo, delta_p0, vlen);

    VFLOAT q_hi, q_lo;
    q_hi = __riscv_vfcvt_f(Q, vlen);
    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen);
    VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_right, vlen);
    delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_left, x_in_left, vlen);
    q_lo = __riscv_vfadd(q_lo, delta_q0, vlen);

    if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
      VFLOAT p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny;
      ERFCINV_PQ_TINY(T_tiny, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen);
      p_hi = __riscv_vmerge(p_hi, p_hi_tiny, x_is_tiny, vlen);
      p_lo = __riscv_vmerge(p_lo, p_lo_tiny, x_is_tiny, vlen);
      q_hi = __riscv_vmerge(q_hi, q_hi_tiny, x_is_tiny, vlen);
      q_lo = __riscv_vmerge(q_lo, q_lo_tiny, x_is_tiny, vlen);
    }

    // (y_hi, y_lo) <-- (w_hi + w_lo) * (p_hi + p_lo)
    y_hi = __riscv_vfmul(w_hi, p_hi, vlen);
    y_lo = __riscv_vfmsub(w_hi, p_hi, y_hi, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_hi, p_lo, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen);

    DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen);

    vy = w_hi;
    vy = __riscv_vfsgnj(vy, vx_sign, vlen);
    vy = __riscv_vmerge(vy, vy_special, special_args, vlen);

    // copy vy into y and increment addr pointers
    VFSTORE_OUTARG1(vy, vlen);

    INCREMENT_INARG1(vlen);
    INCREMENT_OUTARG1(vlen);
  }
  RESTORE_FRM;
}
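The `(y_hi, y_lo) <-- (w_hi + w_lo) * (p_hi + p_lo)` step near the end of the routine above is a standard FMA-based two-term product; a scalar model, with `fma()` standing in for `vfmsub`/`vfmacc`:

```c
#include <math.h>

// Given w = w_hi + w_lo and p = p_hi + p_lo with |w_lo| << ulp(w_hi) and
// |p_lo| << ulp(p_hi), produce y_hi + y_lo ~ w * p. The w_lo * p_lo term
// is below the accuracy target and is dropped, exactly as in the vector code.
static void mul2(double w_hi, double w_lo, double p_hi, double p_lo,
                 double *y_hi, double *y_lo) {
  *y_hi = w_hi * p_hi;
  double err = fma(w_hi, p_hi, -*y_hi); // exact low part of w_hi * p_hi
  err += w_hi * p_lo;                   // cross terms
  err += w_lo * p_hi;
  *y_lo = err;
}
```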
diff --git a/include/rvvlm_erfinvD.inc.h b/include/rvvlm_erfinvD.inc.h
new file mode 100644
index 0000000..f7d78b4
--- /dev/null
+++ b/include/rvvlm_erfinvD.inc.h
@@ -0,0 +1,180 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_ERFINVD_STD
#else
#define F_VER1 RVVLM_ERFINVDI_STD
#endif

// Two regions of approximation: left is [0, 0x1.7p-1) and right is [0x1.7p-1, 1).
// Both are done with rational functions.
// For left, x*P(x)/Q(x), x in [0, 0x1.7p-1)
// For right, y*P(t)/Q(t), y = sqrt(-log(1-x)); and t = 1/y

// P_coefficients in ascending order, all in Q79. p0_delta is in floating point
#define P_left_0 -0x48dbe9f5b3eabaa
#define P_left_1 -0xb35279f1a626ae5
#define P_left_2 0x33789911873d184a
#define P_left_3 -0x1bf9138fc77c0fbf
#define P_left_4 -0x2d4ec43bc48403d4
#define P_left_5 0x2d61deb53842cca1
#define P_left_6 0x23324eca6b3ff02
#define P_left_7 -0xd4ec1d31542c4fc
#define P_left_8 0x2ecf3c60308b0f2
#define P_left_9 0x7c917b3378071e
#define P_left_10 -0x1e09b597f226ca
#define DELTA_P0_left -0x1.ec7dc41c17860p-2

// Q_coefficients in ascending order, all in Q79. q0_delta is in floating point
#define Q_left_0 -0x52366e5b14c0970
#define Q_left_1 -0xca57e95abcc599b
#define Q_left_2 0x3b6c91ec67f5759c
#define Q_left_3 -0x1c40d5daa3be22bc
#define Q_left_4 -0x41f11eb5d837386c
#define Q_left_5 0x3c6ce478fcd75c9a
#define Q_left_6 0xbb1cd7270cfba1d
#define Q_left_7 -0x1988a4116498f1af
#define Q_left_8 0x44dc3042f103d20
#define Q_left_9 0x2390e683d02edf3
#define Q_left_10 -0x8ec66f2a7e410c
#define DELTA_Q0_left -0x1.29a0161e99446p-3

// P_coefficients in ascending order, all in Q67. p0_delta is in floating point
#define P_right_0 0x17a0bb69321df
#define P_right_1 0x402eb416ae6015
#define P_right_2 0x2973eb18028ce34
#define P_right_3 0x8034a7ece1d5370
#define P_right_4 0xa76c08a74dae273
#define P_right_5 0x11dd3876b83dd078
#define P_right_6 0xfdd7693c3b77653
#define P_right_7 0xb33d66152b3c223
#define P_right_8 0x5a564c28c6a41a9
#define P_right_9 0x1190449fe630213
#define P_right_10 -0x659c784274e1
#define DELTA_P0_right -0x1.d622f4cbe0eeep-2

// Q_coefficients in ascending order, all in Q67. q0_delta is in floating point
#define Q_right_0 0x17a09aabf9cee
#define Q_right_1 0x4030b9059ffcad
#define Q_right_2 0x29b26b0d87f7855
#define Q_right_3 0x87572a13d3fa2dd
#define Q_right_4 0xd7a728b5620ac3c
#define Q_right_5 0x1754392b473fd439
#define Q_right_6 0x1791b9a091a816c2
#define Q_right_7 0x167f71db9e13b075
#define Q_right_8 0xcb9f5f3e5e618a4
#define Q_right_9 0x68271fae767c68e
#define Q_right_10 0x13745c4fa224b25
#define DELTA_Q0_right 0x1.f7e7557a34ae6p-2

void F_VER1(API) {
  size_t vlen;
  VFLOAT vx, vx_orig, vy, vy_special;
  VBOOL special_args;

  SET_ROUNDTONEAREST;
  // stripmining over input arguments
  for (; _inarg_n > 0; _inarg_n -= vlen) {
    vlen = VSET(_inarg_n);
    vx = VFLOAD_INARG1(vlen);
    vx_orig = vx;

    // Handle Inf and NaN
    EXCEPTION_HANDLING_ERFINV(vx, special_args, vy_special, vlen);

    vx = __riscv_vfsgnj(vx, fp_posOne, vlen);
    VBOOL x_in_right = __riscv_vmfge(vx, 0x1.7p-1, vlen);

    VFLOAT w_hi, w_lo, w_hi_right, w_lo_right, y_hi, y_lo;
    VINT T, T_right;

    if (__riscv_vcpop(x_in_right, vlen) > 0) {
      VFLOAT one_minus_x;
      one_minus_x = __riscv_vfrsub(vx, fp_posOne, vlen);

      VINT n_adjust;
      n_adjust = __riscv_vxor(n_adjust, n_adjust, vlen);

      NEG_LOGX_4_TRANSFORM(one_minus_x, n_adjust, y_hi, y_lo, vlen);

      SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_right, w_lo_right, T_right,
                        0x1.0p63, 0x1.0p-63, vlen);
    }
    T = __riscv_vfcvt_x(__riscv_vfmul(vx, 0x1.0p63, vlen), vlen);
    T = __riscv_vmerge(T, T_right, x_in_right, vlen);

    w_hi = vx;
    w_lo = I_AS_F(__riscv_vxor(F_AS_I(w_lo), F_AS_I(w_lo), vlen));
    w_hi = __riscv_vmerge(w_hi, w_hi_right, x_in_right, vlen);
    w_lo = __riscv_vmerge(w_lo, w_lo_right, x_in_right, vlen);

    // For the transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
    VINT P, Q;

    P = __riscv_vmerge(VMVI_VX(P_left_10, vlen), P_right_10, x_in_right, vlen);
    P = PSTEP_I_ab(x_in_right, P_right_6, P_left_6, T,
        PSTEP_I_ab(x_in_right, P_right_7, P_left_7, T,
        PSTEP_I_ab(x_in_right, P_right_8, P_left_8, T,
        PSTEP_I_ab(x_in_right, P_right_9, P_left_9, T, P,
        vlen), vlen), vlen), vlen);

    Q = __riscv_vmerge(VMVI_VX(Q_left_10, vlen), Q_right_10, x_in_right, vlen);
    Q = PSTEP_I_ab(x_in_right, Q_right_6, Q_left_6, T,
        PSTEP_I_ab(x_in_right, Q_right_7, Q_left_7, T,
        PSTEP_I_ab(x_in_right, Q_right_8, Q_left_8, T,
        PSTEP_I_ab(x_in_right, Q_right_9, Q_left_9, T, Q,
        vlen), vlen), vlen), vlen);

    P = PSTEP_I_ab(x_in_right, P_right_0, P_left_0, T,
        PSTEP_I_ab(x_in_right,
        P_right_1, P_left_1, T,
        PSTEP_I_ab(x_in_right, P_right_2, P_left_2, T,
        PSTEP_I_ab(x_in_right, P_right_3, P_left_3, T,
        PSTEP_I_ab(x_in_right, P_right_4, P_left_4, T,
        PSTEP_I_ab(x_in_right, P_right_5, P_left_5, T, P,
        vlen), vlen), vlen), vlen), vlen), vlen);

    Q = PSTEP_I_ab(x_in_right, Q_right_0, Q_left_0, T,
        PSTEP_I_ab(x_in_right, Q_right_1, Q_left_1, T,
        PSTEP_I_ab(x_in_right, Q_right_2, Q_left_2, T,
        PSTEP_I_ab(x_in_right, Q_right_3, Q_left_3, T,
        PSTEP_I_ab(x_in_right, Q_right_4, Q_left_4, T,
        PSTEP_I_ab(x_in_right, Q_right_5, Q_left_5, T, Q,
        vlen), vlen), vlen), vlen), vlen), vlen);

    VFLOAT p_hi, p_lo;
    p_hi = __riscv_vfcvt_f(P, vlen);
    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), vlen);
    VFLOAT delta_p0 = VFMV_VF(DELTA_P0_left, vlen);
    delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_right, x_in_right, vlen);
    p_lo = __riscv_vfadd(p_lo, delta_p0, vlen);

    VFLOAT q_hi, q_lo;
    q_hi = __riscv_vfcvt_f(Q, vlen);
    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), vlen);
    VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_left, vlen);
    delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_right, x_in_right, vlen);
    q_lo = __riscv_vfadd(q_lo, delta_q0, vlen);

    // (y_hi, y_lo) <-- (w_hi + w_lo) * (p_hi + p_lo)
    y_hi = __riscv_vfmul(w_hi, p_hi, vlen);
    y_lo = __riscv_vfmsub(w_hi, p_hi, y_hi, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_hi, p_lo, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen);

    DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen);

    vy = w_hi;

    vy = __riscv_vfsgnj(vy, vx_orig, vlen);
    vy = __riscv_vmerge(vy, vy_special, special_args, vlen);
    // copy vy into y and increment addr pointers
    VFSTORE_OUTARG1(vy, vlen);

    INCREMENT_INARG1(vlen);
    INCREMENT_OUTARG1(vlen);
  }
  RESTORE_FRM;
}
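The shared helper header below opens with two-term splits of the constants it needs: `RTPI_BY2_HI/LO` together give sqrt(pi)/2, the slope of erfinv at 0, to well beyond double precision, and `NEG_LOG2_HI/LO` give -log(2), with `_HI` chosen with trailing zero bits so that products like `n * NEG_LOG2_HI` stay exact. A standalone sanity check of the first split:

```c
#include <math.h>
#include <stdio.h>

// Not library code: verify RTPI_BY2_HI + RTPI_BY2_LO ~ sqrt(pi)/2 in long
// double; the residual should be far below one ulp of a double.
int main(void) {
  long double ref = sqrtl(3.141592653589793238462643383279503L) * 0.5L;
  double hi = 0x1.c5bf891b4ef6ap-1, lo = 0x1.4f38760a41abbp-54;
  printf("residual: %.3Le\n", ref - ((long double)hi + (long double)lo));
  return 0;
}
```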
diff --git a/include/rvvlm_inverrorfuncsD.h b/include/rvvlm_inverrorfuncsD.h
new file mode 100644
index 0000000..1b87630
--- /dev/null
+++ b/include/rvvlm_inverrorfuncsD.h
@@ -0,0 +1,215 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#define RTPI_BY2_HI 0x1.c5bf891b4ef6ap-1
#define RTPI_BY2_LO 0x1.4f38760a41abbp-54
#define NEG_LOG2_HI -0x1.62e42fefa4000p-1
#define NEG_LOG2_LO 0x1.8432a1b0e2634p-43

// P_coefficients in ascending order, in varying scales. p0_delta is in floating point
#define P_tiny_0 -0x8593442e139d // scale 66
#define P_tiny_1 -0x1fcf7055ac5f03 // scale 64
#define P_tiny_2 -0x106dde33d8dc179 // scale 61
#define P_tiny_3 -0xc31d6e09935118a // scale 60
#define P_tiny_4 -0x3560de73cb5bbcc0 // scale 59
#define P_tiny_5 -0x1eb4c7e14b254de8 // scale 57
#define P_tiny_6 0x1fdf5a9d23430bd7 // scale 56
#define P_tiny_7 0x62c4020631de121b // scale 56
#define P_tiny_8 0x4be1ed5d031773f1 // scale 58
#define P_tiny_9 0x55a9f8b9538981a1 // scale 60
#define DELTA_P0_tiny 0x1.ba4d0b79d16e6p-2 // scale 66

// Q_coefficients in ascending order, in varying scales. q0_delta is in floating point
#define Q_tiny_0 -0x85933cda2d6d // scale 66
#define Q_tiny_1 -0x1fcf792da7d51d // scale 64
#define Q_tiny_2 -0x106ec6e0ed13ae1 // scale 61
#define Q_tiny_3 -0x61b925a39a461aa // scale 59
#define Q_tiny_4 -0x35ebf9dc72fab062 // scale 59
#define Q_tiny_5 -0x2131cf7760e82873 // scale 57
#define Q_tiny_6 0x1860ae67db2a6609 // scale 56
#define Q_tiny_7 0x5e9a123701d89289 // scale 56
#define Q_tiny_8 0x417b35aab14ac49d // scale 56
#define Q_tiny_9 0x5e4a26a7c1415755 // scale 57
#define DELTA_Q0_tiny 0x1.8a7adad44d65ap-4 // scale 66

// erfinv(+-1) = +-Inf with divide by zero
// erfinv(x), |x| > 1, real is NaN with invalid
// erfinv(NaN) is NaN, invalid if input is signalling NaN
// erfinv(x) is (rt(pi)/2) x for |x| < 2^-30
#define EXCEPTION_HANDLING_ERFINV(vx, special_args, vy_special, vlen) \
  do { \
    VUINT expo_x = __riscv_vand(__riscv_vsrl(F_AS_U((vx)), MAN_LEN, (vlen)), \
                                0x7FF, (vlen)); \
    VBOOL x_large = __riscv_vmsgeu(expo_x, EXP_BIAS, (vlen)); \
    VBOOL x_small = __riscv_vmsltu(expo_x, EXP_BIAS - 30, (vlen)); \
    (special_args) = __riscv_vmor(x_large, x_small, (vlen)); \
    if (__riscv_vcpop((special_args), (vlen)) > 0) { \
      VFLOAT abs_x = __riscv_vfsgnj((vx), fp_posOne, (vlen)); \
      VBOOL x_gt_1 = __riscv_vmfgt(abs_x, fp_posOne, (vlen)); \
      VBOOL x_eq_1 = __riscv_vmfeq(abs_x, fp_posOne, (vlen)); \
      /* substitute |x| > 1 with sNaN */ \
      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_gt_1, (vlen)); \
      /* substitute |x| = 1 with +/-Inf and generate div-by-zero signal */ \
      VFLOAT tmp = VFMV_VF(fp_posZero, (vlen)); \
      tmp = __riscv_vfsgnj(tmp, (vx), (vlen)); \
      tmp = __riscv_vfrec7(x_eq_1, tmp, (vlen)); \
      (vy_special) = __riscv_vfadd((special_args), (vx), (vx), (vlen)); \
      (vy_special) = __riscv_vmerge((vy_special), tmp, x_eq_1, (vlen)); \
      tmp = __riscv_vfmul(x_small, (vx), RTPI_BY2_LO, (vlen)); \
      tmp = __riscv_vfmacc(x_small, tmp, RTPI_BY2_HI, (vx), (vlen)); \
      (vy_special) = __riscv_vmerge((vy_special), tmp, x_small, (vlen)); \
      (vx) = __riscv_vfmerge((vx), fp_posZero, (special_args), (vlen)); \
    } \
  } while (0)
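A scalar model of the special-case policy that `EXCEPTION_HANDLING_ERFINV` encodes (illustrative only; the vector code routes lanes by biased exponent, substitutes sNaN/Inf inputs, and patches `vx` so the main path runs harmlessly on special lanes):

```c
#include <math.h>

static double erfinv_special(double x) {
  double ax = fabs(x);
  if (isnan(x)) return x + x;             // quiet the NaN; invalid if sNaN
  if (ax > 1.0) return (x - x) / (x - x); // 0/0 -> NaN, raises invalid
  if (ax == 1.0) return copysign(1.0, x) / 0.0; // +-Inf, raises div-by-zero
  // |x| < 2^-30: erfinv(x) ~ (sqrt(pi)/2) * x, evaluated as a hi/lo product
  return 0x1.c5bf891b4ef6ap-1 * x + 0x1.4f38760a41abbp-54 * x;
}
```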
// erfcinv(0) = Inf, erfcinv(2) = -Inf with divide by zero
// erfcinv(x), x outside [0, 2], real is NaN with invalid
// erfcinv(NaN) is NaN, invalid if input is signalling NaN
#define EXCEPTION_HANDLING_ERFCINV(vx, special_args, vy_special, vlen) \
  do { \
    VUINT vclass = __riscv_vfclass((vx), (vlen)); \
    IDENTIFY(vclass, 0x39F, (special_args), (vlen)); \
    VBOOL x_ge_2 = __riscv_vmfge((vx), 0x1.0p1, (vlen)); \
    (special_args) = __riscv_vmor((special_args), x_ge_2, (vlen)); \
    if (__riscv_vcpop((special_args), (vlen)) > 0) { \
      VBOOL x_gt_2 = __riscv_vmfgt((vx), 0x1.0p1, (vlen)); \
      VBOOL x_lt_0 = __riscv_vmflt((vx), fp_posZero, (vlen)); \
      /* substitute x > 2 or x < 0 with sNaN */ \
      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_gt_2, (vlen)); \
      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_lt_0, (vlen)); \
      /* substitute x = 0 or 2 with +/-Inf and generate div-by-zero signal */ \
      VFLOAT tmp = VFMV_VF(fp_posZero, (vlen)); \
      VFLOAT x_tmp = __riscv_vfrsub((vx), fp_posOne, (vlen)); \
      tmp = __riscv_vfsgnj(tmp, x_tmp, (vlen)); \
      VBOOL x_eq_2 = __riscv_vmfeq((vx), 0x1.0p1, (vlen)); \
      VBOOL x_eq_0 = __riscv_vmfeq((vx), fp_posZero, (vlen)); \
      VBOOL pm_Inf = __riscv_vmor(x_eq_2, x_eq_0, (vlen)); \
      tmp = __riscv_vfrec7(pm_Inf, tmp, (vlen)); \
      (vy_special) = __riscv_vfsub((special_args), (vx), (vx), (vlen)); \
      (vy_special) = __riscv_vmerge((vy_special), tmp, pm_Inf, (vlen)); \
      (vx) = __riscv_vfmerge((vx), fp_posOne, (special_args), (vlen)); \
    } \
  } while (0)

// Compute -log(2^(-n_adjust) * x), where x < 1
#define NEG_LOGX_4_TRANSFORM(vx, n_adjust, y_hi, y_lo, vlen) \
  do { \
    /* work on entire vector register */ \
    VFLOAT vx_in = (vx); \
    VINT n = __riscv_vadd(__riscv_vsra(F_AS_I(vx_in), MAN_LEN - 8, (vlen)), \
                          0x96, (vlen)); \
    n = __riscv_vsub(__riscv_vsra(n, 8, (vlen)), EXP_BIAS, (vlen)); \
    VFLOAT scale = I_AS_F(__riscv_vsll(__riscv_vrsub(n, EXP_BIAS, (vlen)), \
                                       MAN_LEN, (vlen))); \
    vx_in = __riscv_vfmul(vx_in, scale, (vlen)); \
    /* x is scaled, and -log(x) is 2 atanh(w/2); w = 2(1-x)/(1+x) */ \
    n = __riscv_vsub(n, (n_adjust), (vlen)); \
    VFLOAT n_flt = __riscv_vfcvt_f(n, (vlen)); \
    VFLOAT numer = __riscv_vfrsub(vx_in, fp_posOne, (vlen)); \
    /* note that 1-x is exact as 1/2 < x < 2 */ \
    numer = __riscv_vfadd(numer, numer, (vlen)); \
    VFLOAT denom = __riscv_vfadd(vx_in, fp_posOne, (vlen)); \
    VFLOAT delta_denom = __riscv_vfadd( \
        __riscv_vfrsub(denom, fp_posOne, (vlen)), vx_in, (vlen)); \
    /* note that 1 - denom is exact even if denom > 2 */ \
    /* because 1 has many trailing zeros */ \
    VFLOAT r_hi, r_lo, r; \
    DIV_N1D2(numer, denom, delta_denom, r_hi, r_lo, (vlen)); \
    r = __riscv_vfadd(r_hi, r_lo, (vlen)); \
    /* for the original unscaled x, we have */ \
    /* -log(x) = -n * log(2) + 2 atanh(-w/2) */ \
    /* where w = 2(1-x)/(1+x); -w = 2(x-1)/(x+1) */ \
    VFLOAT A, B; \
    A = __riscv_vfmadd(n_flt, NEG_LOG2_HI, r_hi, (vlen)); \
    B = __riscv_vfmsub(n_flt, NEG_LOG2_HI, A, (vlen)); \
    B = __riscv_vfadd(B, r_hi, (vlen)); \
    B = __riscv_vfadd(r_lo, B, (vlen)); \
    VFLOAT rsq = __riscv_vfmul(r, r, (vlen)); \
    VFLOAT rcube = __riscv_vfmul(rsq, r, (vlen)); \
    VFLOAT r6 = __riscv_vfmul(rcube, rcube, (vlen)); \
    VFLOAT poly_right = PSTEP(0x1.74681ff881228p-14, rsq, \
                              PSTEP(0x1.39751be23e4a3p-16, \
                                    0x1.30a893993e73dp-18, rsq, vlen), \
                              vlen); \
    VFLOAT poly_left = PSTEP(0x1.999999996ce82p-7, rsq, \
                             PSTEP(0x1.249249501b1adp-9, \
                                   0x1.c71c47e7189f6p-12, rsq, vlen), \
                             vlen); \
    poly_left = __riscv_vfmacc(poly_left, r6, poly_right, (vlen)); \
    poly_left = PSTEP(0x1.55555555555dbp-4, rsq, poly_left, (vlen)); \
    B = __riscv_vfmacc(B, NEG_LOG2_LO, n_flt, (vlen)); \
    B = __riscv_vfmacc(B, rcube, poly_left, (vlen)); \
    FAST2SUM(A, B, (y_hi), (y_lo), (vlen)); \
    /* A + B is -log(x) with extra precision, |B| <= ulp(A)/2 */ \
  } while (0)
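`NEG_LOGX_4_TRANSFORM` above rests on the identity -log(x) = -n*log(2) + 2*atanh(r/2), with x' = x*2^-n near 1 and r = 2(1-x')/(1+x'); the odd series 2*atanh(r/2) = r + r^3/12 + r^5/80 + r^7/448 + ... matches the `poly_left`/`poly_right` coefficients. A crude scalar model (the library rounds n so the reduced argument sits closer to 1, and carries r and n*log(2) in hi/lo pairs; this truncated version is only illustrative):

```c
#include <math.h>

static double neg_log_model(double x) {
  int n;
  double xs = frexp(x, &n);                 // x = xs * 2^n, xs in [0.5, 1)
  double r = 2.0 * (1.0 - xs) / (1.0 + xs); // -log(xs) = 2*atanh(r/2)
  double r2 = r * r;
  double series =
      r * (1.0 + r2 * (1.0 / 12 + r2 * (1.0 / 80 + r2 * (1.0 / 448))));
  return -(double)n * M_LN2 + series;       // = -log(x), approximately
}
```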
// This macro computes w_hi + w_lo = sqrt(y_hi + y_lo) in floating point
// and 1/(w_hi + w_lo) as a Q63 fixed-point T.
// y_hi, y_lo is normalized on input; that is, y_hi has
// full working precision of the sum y_hi + y_lo,
// and 2 log(2) < y_hi < 1100 log(2)
#define SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi, w_lo, T, t_sc, t_sc_inv, vlen) \
  do { \
    (w_hi) = __riscv_vfsqrt((y_hi), (vlen)); \
    (w_lo) = __riscv_vfnmsub((w_hi), (w_hi), (y_hi), (vlen)); \
    (w_lo) = __riscv_vfadd((w_lo), (y_lo), (vlen)); \
    VFLOAT recip; \
    recip = __riscv_vfadd((y_hi), (y_hi), (vlen)); \
    recip = __riscv_vfrec7(recip, (vlen)); \
    recip = __riscv_vfmul(recip, (w_hi), (vlen)); \
    (w_lo) = __riscv_vfmul((w_lo), recip, (vlen)); \
    /* w_hi + w_lo is sqrt(y_hi + y_lo) to extra precision */ \
    /* now compute T = t_sc/(w_hi + w_lo) as fixed point */ \
    VFLOAT t_lo = VFMV_VF((t_sc), (vlen)); \
    VFLOAT t_hi = __riscv_vfdiv(t_lo, (w_hi), (vlen)); \
    (T) = __riscv_vfcvt_x(t_hi, (vlen)); \
    t_lo = __riscv_vfnmsac(t_lo, (w_hi), t_hi, (vlen)); \
    t_lo = __riscv_vfnmsac(t_lo, (w_lo), t_hi, (vlen)); \
    t_lo = __riscv_vfmul(t_lo, t_hi, (vlen)); \
    t_lo = __riscv_vfmul(t_lo, (t_sc_inv), (vlen)); \
    (T) = __riscv_vadd((T), __riscv_vfcvt_x(t_lo, (vlen)), (vlen)); \
  } while (0)

#define ERFCINV_PQ_TINY(T, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, vlen) \
  do { \
    /* T is in scale of 65 */ \
    VINT P, Q; \
    P = PSTEP_I_SRA(P_tiny_7, T, 4, \
                    PSTEP_I_SRA(P_tiny_8, P_tiny_9, 4, T, (vlen)), (vlen)); \
    /* P in Q_56 */ \
    P = PSTEP_I_SRA(P_tiny_5, T, 1, \
                    PSTEP_I_SRA(P_tiny_6, P, 2, T, (vlen)), (vlen)); \
    /* P in Q_57 */ \
    P = PSTEP_I_SRA(P_tiny_3, T, 1, PSTEP_I(P_tiny_4, P, T, (vlen)), (vlen)); \
    /* P in Q_60 */ \
    P = PSTEP_I_SLL(P_tiny_1, T, 1, \
                    PSTEP_I_SRA(P_tiny_2, P, 1, T, (vlen)), (vlen)); \
    /* P in Q_64 */ \
    P = PSTEP_I(P_tiny_0, T, P, (vlen)); \
    /* P in Q_66 */ \
    \
    Q = PSTEP_I_SRA(Q_tiny_7, T, 2, \
                    PSTEP_I_SRA(Q_tiny_8, Q_tiny_9, 3, T, (vlen)), (vlen)); \
    /* Q in Q_56 */ \
    Q = PSTEP_I_SRA(Q_tiny_5, T, 1, \
                    PSTEP_I_SRA(Q_tiny_6, Q, 2, T, (vlen)), (vlen)); \
    /* Q in Q_57 */ \
    Q = PSTEP_I_SRA(Q_tiny_3, T, 2, PSTEP_I(Q_tiny_4, Q, T, (vlen)), (vlen)); \
    /* Q in Q_59 */ \
    Q = PSTEP_I_SLL(Q_tiny_1, T, 1, PSTEP_I(Q_tiny_2, Q, T, (vlen)), (vlen)); \
    /* Q in Q_64 */ \
    Q = PSTEP_I(Q_tiny_0, T, Q, (vlen)); \
    /* Q in Q_66 */ \
    \
    p_hi_tiny = __riscv_vfcvt_f(P, (vlen)); \
    p_lo_tiny = __riscv_vfcvt_f( \
        __riscv_vsub(P, __riscv_vfcvt_x(p_hi_tiny, (vlen)), (vlen)), (vlen)); \
    p_lo_tiny = __riscv_vfadd(p_lo_tiny, DELTA_P0_tiny, (vlen)); \
    q_hi_tiny = __riscv_vfcvt_f(Q, (vlen)); \
    q_lo_tiny = __riscv_vfcvt_f( \
        __riscv_vsub(Q, __riscv_vfcvt_x(q_hi_tiny, (vlen)), (vlen)), (vlen)); \
    q_lo_tiny = __riscv_vfadd(q_lo_tiny, DELTA_Q0_tiny, (vlen)); \
  } while (0)
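`SQRTX_4_TRANSFORM` refines `vfsqrt` with one first-order correction, sqrt(y) ~ w_hi + (y - w_hi^2)/(2*w_hi), and then builds the fixed-point reciprocal T ~ t_sc/w that the polynomials run on. A scalar sketch, using an exact divide where the vector code uses a `vfrec7` estimate plus FMA clean-up:

```c
#include <math.h>
#include <stdint.h>

// fma() stands in for vfnmsub/vfnmsac; llrint for vfcvt_x.
static void sqrtx_model(double y_hi, double y_lo, double t_sc,
                        double *w_hi, double *w_lo, int64_t *T) {
  *w_hi = sqrt(y_hi);
  // residual y - w_hi^2 (exact via fma), folded with y_lo:
  double res = fma(-*w_hi, *w_hi, y_hi) + y_lo;
  *w_lo = res / (2.0 * *w_hi); // first-order sqrt correction
  // T = round(t_sc / (w_hi + w_lo)); the vector code adds the correction
  // as a second fixed-point term instead of a fresh divide.
  *T = (int64_t)llrint(t_sc / (*w_hi + *w_lo));
}
```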
diff --git a/src/rvvlm_erfcinvD.c b/src/rvvlm_erfcinvD.c
new file mode 100644
index 0000000..356c2ff
--- /dev/null
+++ b/src/rvvlm_erfcinvD.c
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE UNIT_STRIDE

#include RVVLM_ERFCINVD_VSET_CONFIG

#define COMPILE_FOR_ERFCINV
#include "rvvlm_inverrorfuncsD.h"

#include "rvvlm_erfcinvD.inc.h"
diff --git a/src/rvvlm_erfcinvDI.c b/src/rvvlm_erfcinvDI.c
new file mode 100644
index 0000000..ab6defb
--- /dev/null
+++ b/src/rvvlm_erfcinvDI.c
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE GENERAL_STRIDE

#include RVVLM_ERFCINVDI_VSET_CONFIG

#define COMPILE_FOR_ERFCINV
#include "rvvlm_inverrorfuncsD.h"

#include "rvvlm_erfcinvD.inc.h"
diff --git a/src/rvvlm_erfinvD.c b/src/rvvlm_erfinvD.c
new file mode 100644
index 0000000..864d6c2
--- /dev/null
+++ b/src/rvvlm_erfinvD.c
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE UNIT_STRIDE

#include RVVLM_ERFINVD_VSET_CONFIG

#define COMPILE_FOR_ERFINV
#include "rvvlm_inverrorfuncsD.h"

#include "rvvlm_erfinvD.inc.h"
diff --git a/src/rvvlm_erfinvDI.c b/src/rvvlm_erfinvDI.c
new file mode 100644
index 0000000..005d032
--- /dev/null
+++ b/src/rvvlm_erfinvDI.c
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>
#include <stdio.h>

#include "rvvlm.h"
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE GENERAL_STRIDE

#include RVVLM_ERFINVDI_VSET_CONFIG

#define COMPILE_FOR_ERFINV
#include "rvvlm_inverrorfuncsD.h"

#include "rvvlm_erfinvD.inc.h"
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f4ce376..89ccd76 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -30,10 +30,12 @@ set(TEST_SOURCES
 # src/test_cbrtI.cpp
 # src/test_cdfnorm.cpp
 # src/test_cdfnormI.cpp
-# src/test_erf.cpp
-# src/test_erfI.cpp
-# src/test_erfc.cpp
-# src/test_erfcI.cpp
+  src/test_erf.cpp
+  src/test_erfI.cpp
+  src/test_erfc.cpp
+  src/test_erfcI.cpp
+  src/test_erfinv.cpp
+  src/test_erfcinv.cpp
   src/test_exp.cpp
   src/test_expI.cpp
   src/test_exp2.cpp
diff --git a/test/include/test_infra.h b/test/include/test_infra.h
index ba09dce..ccb01ab 100644
--- a/test/include/test_infra.h
+++ b/test/include/test_infra.h
@@ -42,6 +42,11 @@ void report_err2_fp64(void (*test_func)(size_t, const double *, size_t, double *,
                       long double (*ref_func)(long double, long double),
                       double, double, int, int, double, double, int, int, int,
                       bool);

+void report_err_byinv_fp64(void (*test_func)(size_t, const double *, double *),
+                           long double (*ref_inv_func)(long double),
+                           long double (*ref_inv_func_prime)(long double),
+                           double, double, int, double = 1.0);
+
 void report_err_pow_fp64(void (*test_func)(size_t, const double *,
                                            const double *, double *),
                          long double (*ref_func)(long double, long double),
@@ -78,3 +83,5 @@ long double cdfnorml(long double);
 long double exp_neg_rsq(long double);
 long double x_transform(long double);
 long double recip_scale(long double);
+long double erfl_prime(long double);
+long double erfcl_prime(long double);
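`report_err_byinv_fp64`, declared above, scores an inverse function without needing a high-precision reference for the inverse itself: it maps the computed output back through the forward long-double reference and scales by its derivative. To first order, with y the computed result and y* the exact one (erfc shown; erf is analogous):

```latex
\operatorname{erfc}(y^{*}) = x
\quad\Longrightarrow\quad
y^{*} - y \;\approx\; \frac{x - \operatorname{erfc}(y)}{\operatorname{erfc}'(y)},
\qquad
\operatorname{erfc}'(y) = -\frac{2}{\sqrt{\pi}}\, e^{-y^{2}} .
```

The absolute value of that estimate is what the implementation below converts to relative and ULP errors; the optional last parameter (default 1.0) is the ULP threshold enforced by `EXPECT_LT`.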
diff --git a/test/src/test_erf.cpp b/test/src/test_erf.cpp
index d8de902..6f82462 100644
--- a/test/src/test_erf.cpp
+++ b/test/src/test_erf.cpp
@@ -14,7 +14,7 @@ TEST(erf, test) {
   COMMENT("erf: current chosen algorithm; reduced argument in FP64 only")

-  // show_special_fp64(rvvlm_erf, "Special Value handling of this function");
+  show_special_fp64(rvvlm_erf, "Special Value handling of this function");

   x_start = 0x1.0p-40;
   x_end = 0x1.0p-20;
diff --git a/test/src/test_erfc.cpp b/test/src/test_erfc.cpp
index 265a15a..e0f149f 100644
--- a/test/src/test_erfc.cpp
+++ b/test/src/test_erfc.cpp
@@ -8,45 +8,48 @@
 #include "rvvlm.h"
 #include "test_infra.h"

-TEST(erfc, test) {
+TEST(erfc, special) {
   unsigned long nb_tests;
   double x_start, x_end;

   COMMENT("erfc: current chosen algorithm; reduced argument in FP64 only")

   show_special_fp64(rvvlm_erfc, "Special Value handling of this function");
+}
+
+TEST(erfc, medium_args) {
+  unsigned long nb_tests;
+  double x_start, x_end;
+
+  COMMENT("erfc: current chosen algorithm; reduced argument in FP64 only")

   x_start = -0x1.0p0;
   x_end = 0x1.0p0;
   nb_tests = 400000;
   report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
+}

-  x_start = 0x1.0p-2;
-  x_end = 0x1.0p0;
-  nb_tests = 400000;
-  report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
-  report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
+TEST(erfc, around_one) {
+  unsigned long nb_tests;
+  double x_start, x_end;

-  x_start = 0x1.0p0;
-  x_end = 0x1.0p3;
-  nb_tests = 400000;
-  report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
-  report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
+  COMMENT("erfc: current chosen algorithm; reduced argument in FP64 only")

-  x_start = 0x1.0p3;
-  x_end = 30.0;
+  x_start = 0x1.0p-1;
+  x_end = 0x1.0p1;
   nb_tests = 400000;
   report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
+}

-  x_start = 0x1.0p-80;
-  x_end = 0x1.0p-50;
-  nb_tests = 400000;
-  report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
-  report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
+TEST(erfc, large) {
+  unsigned long nb_tests;
+  double x_start, x_end;

-  x_start = 0x1.0p-50;
-  x_end = 0x1.0p-10;
+  COMMENT("erfc: current chosen algorithm; reduced argument in FP64 only")
+
+  x_start = 0x1.0p1;
+  x_end = 0x1.0p3;
   nb_tests = 400000;
   report_err_fp64(rvvlm_erfc, erfcl, x_start, x_end, nb_tests);
   report_err_fp64(rvvlm_erfc, erfcl, -x_start, -x_end, nb_tests);
diff --git a/test/src/test_erfcinv.cpp b/test/src/test_erfcinv.cpp
new file mode 100644
index 0000000..dc7f416
--- /dev/null
+++ b/test/src/test_erfcinv.cpp
@@ -0,0 +1,82 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>
#include <math.h>

#include "rvvlm.h"
#include "test_infra.h"

TEST(erfcinv, special) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

  show_special_fp64(rvvlm_erfcinv, "Special Value handling of this function");
}

TEST(erfcinv, tiny_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-1074;
  x_end = 0x1.0p-1000;
  nb_tests = 40000;
  double thres = 3.5;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests, thres);

  x_start = 0x1.0p-1000;
  x_end = 0x1.0p-500;
  nb_tests = 40000;
  thres = 2.5;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests, thres);

  x_start = 0x1.0p-500;
  x_end = 0x1.0p-52;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests, thres);
}

TEST(erfcinv, small_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-50;
  x_end = 0x1.0p-20;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-20;
  x_end = 0x1.0p-4;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);
}

TEST(erfcinv, medium_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfcinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-4;
  x_end = 0x1.0p-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-1;
  x_end = 0x1.8p0;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);

  x_start = 2.0 - 0x1.0p-52;
  x_end = 0x1.8p0;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfcinv, erfcl, erfcl_prime, x_start, x_end,
                        nb_tests);
}
diff --git a/test/src/test_erfinv.cpp b/test/src/test_erfinv.cpp
new file mode 100644
index 0000000..1744e02
--- /dev/null
+++ b/test/src/test_erfinv.cpp
@@ -0,0 +1,96 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>
#include <math.h>

#include "rvvlm.h"
#include "test_infra.h"

TEST(erfinv, special) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")

  show_special_fp64(rvvlm_erfinv, "Special Value handling of this function");
}

TEST(erfinv, small_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-40;
  x_end = 0x1.0p-30;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-30;
  x_end = 0x1.0p-10;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-10;
  x_end = 0x1.0p-4;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);
}

TEST(erfinv, middle_args) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 0x1.0p-4;
  x_end = 0x1.0p-2;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-2;
  x_end = 0x1.0p-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.0p-1;
  x_end = 0x1.78p-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.6p-1;
  x_end = 0x1.7cp-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0x1.71p-1;
  x_end = 0x1.8p-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = -0.8;
  x_end = 0.8;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);

  x_start = 0.25;
  x_end = 0.9;
  nb_tests = 2;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);
}

TEST(erfinv, close_to_1) {
  unsigned long nb_tests;
  double x_start, x_end;

  COMMENT("erfinv: current chosen algorithm; reduced argument in FP64 only")

  x_start = 1.0 - 0x1.0p-53;
  x_end = 0x1.ffp-1;
  nb_tests = 40000;
  report_err_byinv_fp64(rvvlm_erfinv, erfl, erfl_prime, x_start, x_end,
                        nb_tests);
}
diff --git a/test/src/test_infra.cpp b/test/src/test_infra.cpp
index 715a075..686501e 100644
--- a/test/src/test_infra.cpp
+++ b/test/src/test_infra.cpp
@@ -457,6 +457,91 @@ void report_err_fp64(void (*test_func)(size_t, const double *, size_t, double *,
   free(z);
 }

+void report_err_byinv_fp64(void (*test_func)(size_t, const double *, double *),
+                           long double (*ref_inv_func)(long double),
+                           long double (*ref_inv_func_prime)(long double),
+                           double start, double end, int nb_pts,
+                           double threshold) {
+
+  long double f_inv_ref;
+  double *x, *y, delta;
+  long double abs_err, rel_err, ulp_err;
+  long double max_abs_err, max_rel_err, max_ulp_err;
+
+  x = (double *)malloc(nb_pts * sizeof(double));
+  y = (double *)malloc(nb_pts * sizeof(double));
+
+  if (nb_pts <= 1) {
+    delta = 0.0;
+    nb_pts = 1;
+  } else {
+    delta = (end - start) / (double)(nb_pts - 1);
+  }
+
+  max_abs_err = 0.0L;
+  max_rel_err = 0.0L;
+  max_ulp_err = 0.0L;
+
+  // fill up the set of test points
+  for (int i = 0; i < nb_pts; ++i) {
+    x[i] = start + (double)i * delta;
+  }
+
+  // call function under test
+  test_func((size_t)nb_pts, x, y);
+
+  // now for each point we compute error and log the max
+  for (int j = 0; j < nb_pts; j++) {
+    f_inv_ref = ref_inv_func((long double)y[j]);
+    abs_err = (long double)x[j] - f_inv_ref;
+    abs_err = abs_err / ref_inv_func_prime((long double)y[j]);
+    abs_err = ABS(abs_err);
+
+    if (ABS(y[j]) > 0.0) {
+      rel_err = abs_err / ABS(y[j]);
+    } else {
+      rel_err = abs_err / 0x1.0p-1074;
+    }
+    ulp_err = abs_err / ulp_64(y[j]);
+
+    max_abs_err = MAX(max_abs_err, abs_err);
+    max_rel_err = MAX(max_rel_err, rel_err);
+    max_ulp_err = MAX(max_ulp_err, ulp_err);
+
+    if (VERBOSE) {
+      union sui64_fp64 xxx, yyy;
+      xxx.f = x[j];
+      yyy.f = y[j];
+      printf("--input %24.17le, 0x%016lx, output %24.17le, 0x%016lx \n", xxx.f,
+             xxx.ui, yyy.f, yyy.ui);
+      printf(" inverse of computed value %28.25Le\n\n", f_inv_ref);
+      printf(" abs err %8.3Le, rel err %8.3Le, ulp err %8.3Le\n\n", abs_err,
+             rel_err, ulp_err);
+      printf(" max observed abs err %8.3Le\n", max_abs_err);
+    }
+  }
+  printf("----------------------------\n");
+  if ((ABS(start) > 100.) || (ABS(end) < 1.e-2)) {
+    printf("Tested %d points in [%8.3le, %8.3le]\n", nb_pts, start, end);
+  } else {
+    printf("Tested %d points in [%3.3lf, %3.3lf]\n", nb_pts, start, end);
+  }
+  printf("Maximum observed absolute error is %8.3Le\n", max_abs_err);
+  printf("Maximum observed relative error is %8.3Le\n", max_rel_err);
+  if (max_rel_err > 0.0) {
+    printf("   which is 2^(%3.3Lf)\n", log2l(max_rel_err));
+  }
+  printf("Maximum observed ULP error is %3.3Lf\n", max_ulp_err);
+
+  EXPECT_LT(max_ulp_err, threshold);
+
+  free(x);
+  free(y);
+
+  return;
+}
+
 void report_err_pow_fp64(void (*test_func)(size_t, const double *,
                                            const double *, double *),
                          long double (*ref_func)(long double, long double),
@@ -1108,3 +1193,17 @@ long double recip_scale(long double x) {
   result = result / (result + x + x);
   return result;
 }
+
+long double erfl_prime(long double x) {
+  long double two_ov_rt_pi = 1.12837916709551257389615890312154517L;
+  long double y;
+  y = two_ov_rt_pi * expl(-x * x);
+  return y;
+}
+
+long double erfcl_prime(long double x) {
+  long double two_ov_rt_pi = 1.12837916709551257389615890312154517L;
+  long double y;
+  y = -two_ov_rt_pi * expl(-x * x);
+  return y;
+}
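A standalone cross-check (not part of the test suite) that the closed-form derivatives above agree with a symmetric difference quotient of the long-double reference:

```c
#include <math.h>
#include <stdio.h>

// Compare d/dx erfl(x) = (2/sqrt(pi)) exp(-x^2) against a numerical estimate.
static long double erfl_prime_ref(long double x, long double h) {
  return (erfl(x + h) - erfl(x - h)) / (2.0L * h);
}

int main(void) {
  long double x = 0.75L, h = 0x1.0p-20L;
  long double exact = 1.12837916709551257389615890312154517L * expl(-x * x);
  printf("difference quotient: %.18Lg\nclosed form:         %.18Lg\n",
         erfl_prime_ref(x, h), exact);
  return 0;
}
```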