From b0d08463380742de7d7d22b83182f0230c2fe8bd Mon Sep 17 00:00:00 2001
From: Yang Hau
Date: Thu, 1 Aug 2024 10:38:01 +0800
Subject: [PATCH] feat: Add vqrdml[a|s]h[q]_lane[q]_[s16|s32]

---
 neon2rvv.h     |  36 ++++----
 tests/common.h |   4 +-
 tests/impl.cpp | 239 ++++++++++++++++++++++++++++++++++++++++---------
 tests/impl.h   |  12 +--
 4 files changed, 223 insertions(+), 68 deletions(-)
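Notes (illustrative, not applied by git am): vqrdmlah/vqrdmlsh are the
Armv8.1 saturating rounding doubling multiply accumulate/subtract
(high half) operations. As a scalar sketch of what one 16-bit lane of
the mapping below computes — the helper names sat16/rdmlah16 are
hypothetical, not part of the patch:

    #include <stdint.h>

    /* Saturate a 64-bit intermediate to the int16_t range. */
    static int16_t sat16(int64_t x) {
      if (x > INT16_MAX) return INT16_MAX;
      if (x < INT16_MIN) return INT16_MIN;
      return (int16_t)x;
    }

    /* One lane of vqrdmlah_s16: a + ((2*b*c + (1 << 15)) >> 16), with
     * saturation at each narrowing step. This closely mirrors the RVV
     * sequence below: vwmul + vsll (doubling multiply), vnclip with
     * __RISCV_VXRM_RNU (add 1 << 15, shift right 16, clip), vsadd. */
    static int16_t rdmlah16(int16_t a, int16_t b, int16_t c) {
      int64_t prod2 = 2 * (int64_t)b * (int64_t)c;     /* doubling multiply */
      int16_t high = sat16((prod2 + (1 << 15)) >> 16); /* round, narrow */
      return sat16((int64_t)a + high);                 /* saturating accumulate */
    }

    /* vqrdmlsh only differs in the last step: sat16((int64_t)a - high). */

The lane/laneq suffix selects which element of c is broadcast before the
multiply (vrgather below); the q forms operate on 128-bit vectors.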
diff --git a/neon2rvv.h b/neon2rvv.h
index 8cdec040..f8face56 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -9844,60 +9844,60 @@ FORCE_INLINE int32x2_t vqrdmulh_lane_s32(int32x2_t a, int32x2_t b, const int c)
   return __riscv_vnclip_wx_i32m1(ab_mulx2, 32, __RISCV_VXRM_RNU, 2);
 }
 
-FORCE_INLINE int16x8_t vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int __d) {
-  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 8);
+FORCE_INLINE int16x8_t vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
   vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
   vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
   vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 8);
   return __riscv_vsadd_vv_i16m1(a, bc_s, 8);
 }
 
-// FORCE_INLINE int16x4_t vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
+FORCE_INLINE int16x4_t vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
+  return __riscv_vsadd_vv_i16m1(a, bc_s, 4);
+}
 
-// FORCE_INLINE int16x8_t vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, const int lane);
+FORCE_INLINE int16x8_t vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 8);
+  return __riscv_vsadd_vv_i16m1(a, bc_s, 8);
+}
 
-FORCE_INLINE int32x4_t vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int __d) {
-  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 4);
+FORCE_INLINE int32x4_t vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 4);
   vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
   vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 4);
   vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 4);
   return __riscv_vsadd_vv_i32m1(a, bc_s, 4);
 }
 
-// FORCE_INLINE int32x2_t vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, const int lane);
+FORCE_INLINE int32x2_t vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
+  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
+  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 2);
+  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 2);
+  return __riscv_vsadd_vv_i32m1(a, bc_s, 2);
+}
 
-// FORCE_INLINE int32x4_t vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v, const int lane);
+FORCE_INLINE int32x4_t vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 4);
+  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
+  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 4);
+  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 4);
+  return __riscv_vsadd_vv_i32m1(a, bc_s, 4);
+}
 
-FORCE_INLINE int16x4_t vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int __d) {
-  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
+FORCE_INLINE int16x4_t vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
   vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
   vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
   vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
   return __riscv_vsadd_vv_i16m1(a, bc_s, 4);
 }
 
-FORCE_INLINE int32x2_t vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int __d) {
-  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
+FORCE_INLINE int32x2_t vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
   vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
   vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 2);
   vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 2);
   return __riscv_vsadd_vv_i32m1(a, bc_s, 2);
 }
 
-FORCE_INLINE int16x8_t vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int __d) {
-  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
-  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
-  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
-  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
-  return __riscv_vssub_vv_i16m1(a, bc_s, 4);
+FORCE_INLINE int16x8_t vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
+  // vl = 8: the q form computes all eight 16-bit lanes.
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 8);
+  return __riscv_vssub_vv_i16m1(a, bc_s, 8);
 }
 
-// FORCE_INLINE int16x4_t vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
+FORCE_INLINE int16x4_t vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
+  return __riscv_vssub_vv_i16m1(a, bc_s, 4);
+}
 
-// FORCE_INLINE int16x8_t vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, const int lane);
+FORCE_INLINE int16x8_t vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 8);
+  return __riscv_vssub_vv_i16m1(a, bc_s, 8);
+}
 
-FORCE_INLINE int32x4_t vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int __d) {
-  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
-  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
-  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 2);
-  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 2);
-  return __riscv_vssub_vv_i32m1(a, bc_s, 2);
+FORCE_INLINE int32x4_t vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
+  // vl = 4: the q form computes all four 32-bit lanes.
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 4);
+  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
+  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 4);
+  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 4);
+  return __riscv_vssub_vv_i32m1(a, bc_s, 4);
 }
diff --git a/tests/common.h b/tests/common.h
index 16243658..bdaeb5a8 100644
--- a/tests/common.h
+++ b/tests/common.h
@@ -202,12 +202,12 @@ TEST_SATURATE_ADD_SUB(64)
   }                                                                                           \
   static inline int##CBIT##_t sat_rdmlah(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \
     int##HBIT##_t tmp = sat_dmull(b, c);                                                      \
-    tmp = sat_add(tmp, (int##HBIT##_t)(1 << (CBIT - 1)));                                     \
+    tmp = sat_add(tmp, (int##HBIT##_t)((int##HBIT##_t)1 << (CBIT - 1)));                      \
     return sat_add(a, (int##CBIT##_t)(tmp >> CBIT));                                          \
   }                                                                                           \
   static inline int##CBIT##_t sat_rdmlsh(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \
     int##HBIT##_t tmp = sat_dmull(b, c);                                                      \
-    tmp = sat_sub(tmp, (int##HBIT##_t)(1 << (CBIT - 1)));                                     \
+    tmp = sat_add(tmp, (int##HBIT##_t)((int##HBIT##_t)1 << (CBIT - 1)));                      \
     return sat_sub(a, (int##CBIT##_t)(tmp >> CBIT));                                          \
   }
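+/* Illustrative worked example (comment only): with CBIT = 16, HBIT = 32,
+ * a = 1000, b = 16384, c = 8192: sat_dmull(b, c) = 2*16384*8192 =
+ * 0x10000000; adding the rounding constant 1 << 15 and shifting right by
+ * 16 gives 4096, so sat_rdmlah = 1000 + 4096 = 5096 and sat_rdmlsh =
+ * 1000 - 4096 = -3096. The rounding constant is added in both cases;
+ * only the final accumulate differs. The (int##HBIT##_t)1 cast keeps the
+ * shift in the wide type; for CBIT = 32, (1 << 31) on a plain int
+ * overflows. */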
 TEST_SATURATE_DMUL(8, 16)
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 1339c891..8741f83c 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -36177,53 +36177,141 @@ result_t test_vqrdmlahq_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter)
   a = vld1q_s16(_a);                                                                      \
   b = vld1q_s16(_b);                                                                      \
   c = vld1_s16(_c);                                                                       \
-  d = vqrdmlahq_lane_s16(a, b, c, IDX);
+  d = vqrdmlahq_lane_s16(a, b, c, IDX);                                                   \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))
 
   IMM_4_ITER
 #undef TEST_IMPL
 
-  return validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
 #endif  // ENABLE_TEST_ALL
 }
 
-result_t test_vqrdmlah_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqrdmlah_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  const int16_t *_c = (const int16_t *)impl.test_cases_int_pointer3;
+  int16_t _d[4];
+  int16x4_t d;
+  int16x4_t a = vld1_s16(_a);
+  int16x4_t b = vld1_s16(_b);
+  int16x8_t c = vld1q_s16(_c);
+
+#define TEST_IMPL(IDX)                                       \
+  for (int i = 0; i < 4; i++) {                              \
+    _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);               \
+  }                                                          \
+  d = vqrdmlah_laneq_s16(a, b, c, IDX);                      \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
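+
+// Note on the test pattern (comment only): the NEON API requires the lane
+// index of these intrinsics to be a constant expression, so the tests
+// cannot loop over it at run time. TEST_IMPL(IDX) is instead expanded once
+// per literal index by the IMM_*_ITER helpers (IMM_8_ITER for an 8-lane c,
+// IMM_4_ITER for 4 lanes, IMM_2_ITER for 2), and each expansion checks its
+// result immediately via CHECK_RESULT.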
+
+result_t test_vqrdmlahq_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  const int16_t *_c = (const int16_t *)impl.test_cases_int_pointer3;
+  int16_t _d[8];
+  int16x8_t d;
+  int16x8_t a = vld1q_s16(_a);
+  int16x8_t b = vld1q_s16(_b);
+  int16x8_t c = vld1q_s16(_c);
+
+#define TEST_IMPL(IDX)                                                                    \
+  for (int i = 0; i < 8; i++) {                                                           \
+    _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);                                            \
+  }                                                                                       \
+  d = vqrdmlahq_laneq_s16(a, b, c, IDX);                                                  \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vqrdmlahq_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
 #if defined(__GNUC__)
   return TEST_UNIMPL;
 #else
   const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
   const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
   const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3;
   int32_t _d[4];
-  int32x4_t a, b, d;
-  int32x2_t c;
+  int32x4_t d;
+  int32x4_t a = vld1q_s32(_a);
+  int32x4_t b = vld1q_s32(_b);
+  int32x2_t c = vld1_s32(_c);
 
 #define TEST_IMPL(IDX)                                       \
   for (int i = 0; i < 4; i++) {                              \
     _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);               \
   }                                                          \
-  a = vld1q_s32(_a);                                         \
-  b = vld1q_s32(_b);                                         \
-  c = vld1_s32(_c);                                          \
-  d = vqrdmlahq_lane_s32(a, b, c, IDX);
+  d = vqrdmlahq_lane_s32(a, b, c, IDX);                      \
+  CHECK_RESULT(validate_int32(d, _d[0], _d[1], _d[2], _d[3]))
 
   IMM_2_ITER
 #undef TEST_IMPL
 
-  return validate_int32(d, _d[0], _d[1], _d[2], _d[3]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
 #endif  // ENABLE_TEST_ALL
-}  // namespace NEON2RVV
+}
 
-result_t test_vqrdmlah_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqrdmlah_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
+  const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
+  const int32_t *_c = (const int32_t *)impl.test_cases_int_pointer3;
+  int32_t _d[2];
+  int32x2_t d;
+  int32x2_t a = vld1_s32(_a);
+  int32x2_t b = vld1_s32(_b);
+  int32x4_t c = vld1q_s32(_c);
+
+#define TEST_IMPL(IDX)                          \
+  for (int i = 0; i < 2; i++) {                 \
+    _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);  \
+  }                                             \
+  d = vqrdmlah_laneq_s32(a, b, c, IDX);         \
+  CHECK_RESULT(validate_int32(d, _d[0], _d[1]))
+
+  IMM_4_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
-result_t test_vqrdmlahq_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqrdmlahq_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  // Mirrors test_vqrdmlah_laneq_s32 with 128-bit operands.
+  const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
+  const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
+  const int32_t *_c = (const int32_t *)impl.test_cases_int_pointer3;
+  int32_t _d[4];
+  int32x4_t d;
+  int32x4_t a = vld1q_s32(_a);
+  int32x4_t b = vld1q_s32(_b);
+  int32x4_t c = vld1q_s32(_c);
+
+#define TEST_IMPL(IDX)                                       \
+  for (int i = 0; i < 4; i++) {                              \
+    _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);               \
+  }                                                          \
+  d = vqrdmlahq_laneq_s32(a, b, c, IDX);                     \
+  CHECK_RESULT(validate_int32(d, _d[0], _d[1], _d[2], _d[3]))
+
+  IMM_4_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
@@ -36236,20 +36324,21 @@ result_t test_vqrdmlah_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
   const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
   const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
   int16_t _d[4];
-  int16x4_t a, b, c, d;
+  int16x4_t d;
+  int16x4_t a = vld1_s16(_a);
+  int16x4_t b = vld1_s16(_b);
+  int16x4_t c = vld1_s16(_c);
 
 #define TEST_IMPL(IDX)                                       \
   for (int i = 0; i < 4; i++) {                              \
     _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);               \
   }                                                          \
-  a = vld1_s16(_a);                                          \
-  b = vld1_s16(_b);                                          \
-  c = vld1_s16(_c);                                          \
-  d = vqrdmlah_lane_s16(a, b, c, IDX);
+  d = vqrdmlah_lane_s16(a, b, c, IDX);                       \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3]))
 
   IMM_4_ITER
 #undef TEST_IMPL
 
-  return validate_int16(d, _d[0], _d[1], _d[2], _d[3]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
@@ -36265,20 +36354,21 @@ result_t test_vqrdmlah_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
   const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
   const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3;
   int32_t _d[2];
-  int32x2_t a, b, c, d;
+  int32x2_t d;
+  int32x2_t a = vld1_s32(_a);
+  int32x2_t b = vld1_s32(_b);
+  int32x2_t c = vld1_s32(_c);
 
 #define TEST_IMPL(IDX)                          \
   for (int i = 0; i < 2; i++) {                 \
     _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);  \
   }                                             \
-  a = vld1_s32(_a);                             \
-  b = vld1_s32(_b);                             \
-  c = vld1_s32(_c);                             \
-  d = vqrdmlah_lane_s32(a, b, c, IDX);
+  d = vqrdmlah_lane_s32(a, b, c, IDX);          \
+  CHECK_RESULT(validate_int32(d, _d[0], _d[1]))
 
   IMM_2_ITER
 #undef TEST_IMPL
 
-  return validate_int32(d, _d[0], _d[1]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
@@ -36294,30 +36384,95 @@ result_t test_vqrdmlshq_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter)
   const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
   const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
   int16_t _d[8];
-  int16x8_t a, b, d;
-  int16x4_t c;
+  int16x8_t d;
+  int16x8_t a = vld1q_s16(_a);
+  int16x8_t b = vld1q_s16(_b);
+  int16x4_t c = vld1_s16(_c);
 
 #define TEST_IMPL(IDX)                                                                    \
   for (int i = 0; i < 8; i++) {                                                           \
     _d[i] = sat_rdmlsh(_a[i], _b[i], _c[IDX]);                                            \
   }                                                                                       \
-  a = vld1q_s16(_a);                                                                      \
-  b = vld1q_s16(_b);                                                                      \
-  c = vld1_s16(_c);                                                                       \
-  d = vqrdmlshq_lane_s16(a, b, c, IDX);
+  d = vqrdmlshq_lane_s16(a, b, c, IDX);                                                   \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))
 
   IMM_4_ITER
 #undef TEST_IMPL
 
-  return validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
 #endif
 }
 
-result_t test_vqrdmlsh_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqrdmlsh_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
+  const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
+  int16_t _d[4];
+  int16x4_t d;
+  int16x4_t a = vld1_s16(_a);
+  int16x4_t b = vld1_s16(_b);
+  int16x8_t c = vld1q_s16(_c);
+
+#define TEST_IMPL(IDX)                                       \
+  for (int i = 0; i < 4; i++) {                              \
+    _d[i] = sat_rdmlsh(_a[i], _b[i], _c[IDX]);               \
+  }                                                          \
+  d = vqrdmlsh_laneq_s16(a, b, c, IDX);                      \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif
+}
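+
+// Note (comment only): expected values for the vqrdmlsh tests come from
+// sat_rdmlsh in tests/common.h, which adds the rounding constant to the
+// doubled product and then saturating-subtracts the high half from the
+// accumulator, mirroring the vnclip(__RISCV_VXRM_RNU) + vssub sequence
+// in neon2rvv.h.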
+
+result_t test_vqrdmlshq_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
+  const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
+  int16_t _d[8];
+  int16x8_t d;
+  int16x8_t a = vld1q_s16(_a);
+  int16x8_t b = vld1q_s16(_b);
+  int16x8_t c = vld1q_s16(_c);
+
+#define TEST_IMPL(IDX)                                                                    \
+  for (int i = 0; i < 8; i++) {                                                           \
+    _d[i] = sat_rdmlsh(_a[i], _b[i], _c[IDX]);                                            \
+  }                                                                                       \
+  d = vqrdmlshq_laneq_s16(a, b, c, IDX);                                                  \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))
 
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif
+}
 
 result_t test_vqrdmlshq_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
diff --git a/tests/impl.h b/tests/impl.h
index b053816d..0bd62c66 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -2059,16 +2059,16 @@
   _(vqrdmulh_lane_s16)          \
   _(vqrdmulh_lane_s32)          \
   _(vqrdmlahq_lane_s16)         \
-  /*_(vqrdmlah_laneq_s16) */    \
-  /*_(vqrdmlahq_laneq_s16) */   \
+  _(vqrdmlah_laneq_s16)         \
+  _(vqrdmlahq_laneq_s16)        \
   _(vqrdmlahq_lane_s32)         \
-  /*_(vqrdmlah_laneq_s32) */    \
-  /*_(vqrdmlahq_laneq_s32) */   \
+  _(vqrdmlah_laneq_s32)         \
+  _(vqrdmlahq_laneq_s32)        \
   _(vqrdmlah_lane_s16)          \
   _(vqrdmlah_lane_s32)          \
   _(vqrdmlshq_lane_s16)         \
-  /*_(vqrdmlsh_laneq_s16) */    \
-  /*_(vqrdmlshq_laneq_s16) */   \
+  _(vqrdmlsh_laneq_s16)         \
+  _(vqrdmlshq_laneq_s16)        \
   _(vqrdmlshq_lane_s32)         \
   /*_(vqrdmlsh_laneq_s32) */    \
   /*_(vqrdmlshq_laneq_s32) */   \