From b0d08463380742de7d7d22b83182f0230c2fe8bd Mon Sep 17 00:00:00 2001
From: Yang Hau
Date: Thu, 1 Aug 2024 10:38:01 +0800
Subject: [PATCH] feat: Add vqrdml[a|s]h[q]_lane[q]_[s16|s32]

---
 neon2rvv.h     |  36 ++++----
 tests/common.h |   4 +-
 tests/impl.cpp | 239 ++++++++++++++++++++++++++++++++++++++++---------
 tests/impl.h   |  12 +--
 4 files changed, 223 insertions(+), 68 deletions(-)
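Notes (illustrative, not applied by git am): vqrdmlah/vqrdmlsh are the
Armv8.1 saturating rounding doubling multiply accumulate/subtract
(high half) operations. As a scalar sketch of what one 16-bit lane of
the mapping below computes — the helper names sat16/rdmlah16 are
hypothetical, not part of the patch:

    #include <stdint.h>

    /* Saturate a 64-bit intermediate to the int16_t range. */
    static int16_t sat16(int64_t x) {
      if (x > INT16_MAX) return INT16_MAX;
      if (x < INT16_MIN) return INT16_MIN;
      return (int16_t)x;
    }

    /* One lane of vqrdmlah_s16: a + ((2*b*c + (1 << 15)) >> 16), with
     * saturation at each narrowing step. This closely mirrors the RVV
     * sequence below: vwmul + vsll (doubling multiply), vnclip with
     * __RISCV_VXRM_RNU (add 1 << 15, shift right 16, clip), vsadd. */
    static int16_t rdmlah16(int16_t a, int16_t b, int16_t c) {
      int64_t prod2 = 2 * (int64_t)b * (int64_t)c;     /* doubling multiply */
      int16_t high = sat16((prod2 + (1 << 15)) >> 16); /* round, narrow */
      return sat16((int64_t)a + high);                 /* saturating accumulate */
    }

    /* vqrdmlsh only differs in the last step: sat16((int64_t)a - high). */

The lane/laneq suffix selects which element of c is broadcast before the
multiply (vrgather below); the q forms operate on 128-bit vectors.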
diff --git a/neon2rvv.h b/neon2rvv.h
index 8cdec040..f8face56 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -9844,60 +9844,60 @@ FORCE_INLINE int32x2_t vqrdmulh_lane_s32(int32x2_t a, int32x2_t b, const int c)
   return __riscv_vnclip_wx_i32m1(ab_mulx2, 32, __RISCV_VXRM_RNU, 2);
 }
 
-FORCE_INLINE int16x8_t vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int __d) {
-  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 8);
+FORCE_INLINE int16x8_t vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
   vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
   vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
   vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 8);
   return __riscv_vsadd_vv_i16m1(a, bc_s, 8);
 }
 
-// FORCE_INLINE int16x4_t vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
+FORCE_INLINE int16x4_t vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
+  return __riscv_vsadd_vv_i16m1(a, bc_s, 4);
+}
 
-// FORCE_INLINE int16x8_t vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, const int lane);
+FORCE_INLINE int16x8_t vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 8);
+  return __riscv_vsadd_vv_i16m1(a, bc_s, 8);
+}
 
-FORCE_INLINE int32x4_t vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int __d) {
-  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 4);
+FORCE_INLINE int32x4_t vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 4);
   vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
   vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 4);
   vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 4);
   return __riscv_vsadd_vv_i32m1(a, bc_s, 4);
 }
 
-// FORCE_INLINE int32x2_t vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, const int lane);
+FORCE_INLINE int32x2_t vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
+  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
+  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 2);
+  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 2);
+  return __riscv_vsadd_vv_i32m1(a, bc_s, 2);
+}
 
-// FORCE_INLINE int32x4_t vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v, const int lane);
+FORCE_INLINE int32x4_t vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 4);
+  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
+  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 4);
+  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 4);
+  return __riscv_vsadd_vv_i32m1(a, bc_s, 4);
+}
 
-FORCE_INLINE int16x4_t vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int __d) {
-  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
+FORCE_INLINE int16x4_t vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
   vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
   vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
   vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
   return __riscv_vsadd_vv_i16m1(a, bc_s, 4);
 }
 
-FORCE_INLINE int32x2_t vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int __d) {
-  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
+FORCE_INLINE int32x2_t vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
   vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
   vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 2);
   vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 2);
   return __riscv_vsadd_vv_i32m1(a, bc_s, 2);
 }
 
-FORCE_INLINE int16x8_t vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int __d) {
-  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
-  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
-  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
-  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
-  return __riscv_vssub_vv_i16m1(a, bc_s, 4);
+FORCE_INLINE int16x8_t vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
+  // vl = 8: the q form computes all eight 16-bit lanes.
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 8);
+  return __riscv_vssub_vv_i16m1(a, bc_s, 8);
 }
 
-// FORCE_INLINE int16x4_t vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
+FORCE_INLINE int16x4_t vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
+  return __riscv_vssub_vv_i16m1(a, bc_s, 4);
+}
 
-// FORCE_INLINE int16x8_t vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, const int lane);
+FORCE_INLINE int16x8_t vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
+  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
+  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
+  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 8);
+  return __riscv_vssub_vv_i16m1(a, bc_s, 8);
+}
 
-FORCE_INLINE int32x4_t vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int __d) {
-  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
-  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
-  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 2);
-  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 2);
-  return __riscv_vssub_vv_i32m1(a, bc_s, 2);
+FORCE_INLINE int32x4_t vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
+  // vl = 4: the q form computes all four 32-bit lanes.
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 4);
+  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
+  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 4);
+  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 4);
+  return __riscv_vssub_vv_i32m1(a, bc_s, 4);
 }
diff --git a/tests/common.h b/tests/common.h
index 16243658..bdaeb5a8 100644
--- a/tests/common.h
+++ b/tests/common.h
@@ -202,12 +202,12 @@ TEST_SATURATE_ADD_SUB(64)
   }                                                                                           \
   static inline int##CBIT##_t sat_rdmlah(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \
     int##HBIT##_t tmp = sat_dmull(b, c);                                                      \
-    tmp = sat_add(tmp, (int##HBIT##_t)(1 << (CBIT - 1)));                                     \
+    tmp = sat_add(tmp, (int##HBIT##_t)((int##HBIT##_t)1 << (CBIT - 1)));                      \
     return sat_add(a, (int##CBIT##_t)(tmp >> CBIT));                                          \
   }                                                                                           \
   static inline int##CBIT##_t sat_rdmlsh(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \
     int##HBIT##_t tmp = sat_dmull(b, c);                                                      \
-    tmp = sat_sub(tmp, (int##HBIT##_t)(1 << (CBIT - 1)));                                     \
+    tmp = sat_add(tmp, (int##HBIT##_t)((int##HBIT##_t)1 << (CBIT - 1)));                      \
     return sat_sub(a, (int##CBIT##_t)(tmp >> CBIT));                                          \
   }
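+/* Illustrative worked example (comment only): with CBIT = 16, HBIT = 32,
+ * a = 1000, b = 16384, c = 8192: sat_dmull(b, c) = 2*16384*8192 =
+ * 0x10000000; adding the rounding constant 1 << 15 and shifting right by
+ * 16 gives 4096, so sat_rdmlah = 1000 + 4096 = 5096 and sat_rdmlsh =
+ * 1000 - 4096 = -3096. The rounding constant is added in both cases;
+ * only the final accumulate differs. The (int##HBIT##_t)1 cast keeps the
+ * shift in the wide type; for CBIT = 32, (1 << 31) on a plain int
+ * overflows. */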
 TEST_SATURATE_DMUL(8, 16)
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 1339c891..8741f83c 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -36177,53 +36177,141 @@ result_t test_vqrdmlahq_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter)
   a = vld1q_s16(_a);                                                                      \
   b = vld1q_s16(_b);                                                                      \
   c = vld1_s16(_c);                                                                       \
-  d = vqrdmlahq_lane_s16(a, b, c, IDX);
+  d = vqrdmlahq_lane_s16(a, b, c, IDX);                                                   \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))
 
   IMM_4_ITER
 #undef TEST_IMPL
 
-  return validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
 #endif  // ENABLE_TEST_ALL
 }
 
-result_t test_vqrdmlah_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqrdmlah_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  const int16_t *_c = (const int16_t *)impl.test_cases_int_pointer3;
+  int16_t _d[4];
+  int16x4_t d;
+  int16x4_t a = vld1_s16(_a);
+  int16x4_t b = vld1_s16(_b);
+  int16x8_t c = vld1q_s16(_c);
+
+#define TEST_IMPL(IDX)                                       \
+  for (int i = 0; i < 4; i++) {                              \
+    _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);               \
+  }                                                          \
+  d = vqrdmlah_laneq_s16(a, b, c, IDX);                      \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
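+
+// Note on the test pattern (comment only): the NEON API requires the lane
+// index of these intrinsics to be a constant expression, so the tests
+// cannot loop over it at run time. TEST_IMPL(IDX) is instead expanded once
+// per literal index by the IMM_*_ITER helpers (IMM_8_ITER for an 8-lane c,
+// IMM_4_ITER for 4 lanes, IMM_2_ITER for 2), and each expansion checks its
+// result immediately via CHECK_RESULT.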
+
+result_t test_vqrdmlahq_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  const int16_t *_c = (const int16_t *)impl.test_cases_int_pointer3;
+  int16_t _d[8];
+  int16x8_t d;
+  int16x8_t a = vld1q_s16(_a);
+  int16x8_t b = vld1q_s16(_b);
+  int16x8_t c = vld1q_s16(_c);
+
+#define TEST_IMPL(IDX)                                                                    \
+  for (int i = 0; i < 8; i++) {                                                           \
+    _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);                                            \
+  }                                                                                       \
+  d = vqrdmlahq_laneq_s16(a, b, c, IDX);                                                  \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vqrdmlahq_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
 #if defined(__GNUC__)
   return TEST_UNIMPL;
 #else
   const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
   const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
   const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3;
   int32_t _d[4];
-  int32x4_t a, b, d;
-  int32x2_t c;
+  int32x4_t d;
+  int32x4_t a = vld1q_s32(_a);
+  int32x4_t b = vld1q_s32(_b);
+  int32x2_t c = vld1_s32(_c);
 
 #define TEST_IMPL(IDX)                                       \
   for (int i = 0; i < 4; i++) {                              \
     _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);               \
   }                                                          \
-  a = vld1q_s32(_a);                                         \
-  b = vld1q_s32(_b);                                         \
-  c = vld1_s32(_c);                                          \
-  d = vqrdmlahq_lane_s32(a, b, c, IDX);
+  d = vqrdmlahq_lane_s32(a, b, c, IDX);                      \
+  CHECK_RESULT(validate_int32(d, _d[0], _d[1], _d[2], _d[3]))
 
   IMM_2_ITER
 #undef TEST_IMPL
 
-  return validate_int32(d, _d[0], _d[1], _d[2], _d[3]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
 #endif  // ENABLE_TEST_ALL
-}  // namespace NEON2RVV
+}
 
-result_t test_vqrdmlah_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqrdmlah_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
+  const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
+  const int32_t *_c = (const int32_t *)impl.test_cases_int_pointer3;
+  int32_t _d[2];
+  int32x2_t d;
+  int32x2_t a = vld1_s32(_a);
+  int32x2_t b = vld1_s32(_b);
+  int32x4_t c = vld1q_s32(_c);
+
+#define TEST_IMPL(IDX)                          \
+  for (int i = 0; i < 2; i++) {                 \
+    _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);  \
+  }                                             \
+  d = vqrdmlah_laneq_s32(a, b, c, IDX);         \
+  CHECK_RESULT(validate_int32(d, _d[0], _d[1]))
+
+  IMM_4_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
-result_t test_vqrdmlahq_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqrdmlahq_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  // Mirrors test_vqrdmlah_laneq_s32 with 128-bit operands.
+  const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
+  const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
+  const int32_t *_c = (const int32_t *)impl.test_cases_int_pointer3;
+  int32_t _d[4];
+  int32x4_t d;
+  int32x4_t a = vld1q_s32(_a);
+  int32x4_t b = vld1q_s32(_b);
+  int32x4_t c = vld1q_s32(_c);
+
+#define TEST_IMPL(IDX)                                       \
+  for (int i = 0; i < 4; i++) {                              \
+    _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);               \
+  }                                                          \
+  d = vqrdmlahq_laneq_s32(a, b, c, IDX);                     \
+  CHECK_RESULT(validate_int32(d, _d[0], _d[1], _d[2], _d[3]))
+
+  IMM_4_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
@@ -36236,20 +36324,21 @@ result_t test_vqrdmlah_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
   const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
   const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
   int16_t _d[4];
-  int16x4_t a, b, c, d;
+  int16x4_t d;
+  int16x4_t a = vld1_s16(_a);
+  int16x4_t b = vld1_s16(_b);
+  int16x4_t c = vld1_s16(_c);
 
 #define TEST_IMPL(IDX)                                       \
   for (int i = 0; i < 4; i++) {                              \
     _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);               \
   }                                                          \
-  a = vld1_s16(_a);                                          \
-  b = vld1_s16(_b);                                          \
-  c = vld1_s16(_c);                                          \
-  d = vqrdmlah_lane_s16(a, b, c, IDX);
+  d = vqrdmlah_lane_s16(a, b, c, IDX);                       \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3]))
 
   IMM_4_ITER
 #undef TEST_IMPL
 
-  return validate_int16(d, _d[0], _d[1], _d[2], _d[3]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
@@ -36265,20 +36354,21 @@ result_t test_vqrdmlah_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
   const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
   const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3;
   int32_t _d[2];
-  int32x2_t a, b, c, d;
+  int32x2_t d;
+  int32x2_t a = vld1_s32(_a);
+  int32x2_t b = vld1_s32(_b);
+  int32x2_t c = vld1_s32(_c);
 
 #define TEST_IMPL(IDX)                          \
   for (int i = 0; i < 2; i++) {                 \
     _d[i] = sat_rdmlah(_a[i], _b[i], _c[IDX]);  \
   }                                             \
-  a = vld1_s32(_a);                             \
-  b = vld1_s32(_b);                             \
-  c = vld1_s32(_c);                             \
-  d = vqrdmlah_lane_s32(a, b, c, IDX);
+  d = vqrdmlah_lane_s32(a, b, c, IDX);          \
+  CHECK_RESULT(validate_int32(d, _d[0], _d[1]))
 
   IMM_2_ITER
 #undef TEST_IMPL
 
-  return validate_int32(d, _d[0], _d[1]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
@@ -36294,30 +36384,95 @@ result_t test_vqrdmlshq_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter)
   const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
   const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
   int16_t _d[8];
-  int16x8_t a, b, d;
-  int16x4_t c;
+  int16x8_t d;
+  int16x8_t a = vld1q_s16(_a);
+  int16x8_t b = vld1q_s16(_b);
+  int16x4_t c = vld1_s16(_c);
 
 #define TEST_IMPL(IDX)                                                                    \
   for (int i = 0; i < 8; i++) {                                                           \
     _d[i] = sat_rdmlsh(_a[i], _b[i], _c[IDX]);                                            \
   }                                                                                       \
-  a = vld1q_s16(_a);                                                                      \
-  b = vld1q_s16(_b);                                                                      \
-  c = vld1_s16(_c);                                                                       \
-  d = vqrdmlshq_lane_s16(a, b, c, IDX);
+  d = vqrdmlshq_lane_s16(a, b, c, IDX);                                                   \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))
 
   IMM_4_ITER
 #undef TEST_IMPL
 
-  return validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]);
+  return TEST_SUCCESS;
 #endif
 #else
   return TEST_UNIMPL;
 #endif
 }
 
-result_t test_vqrdmlsh_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqrdmlsh_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
+  const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
+  int16_t _d[4];
+  int16x4_t d;
+  int16x4_t a = vld1_s16(_a);
+  int16x4_t b = vld1_s16(_b);
+  int16x8_t c = vld1q_s16(_c);
+
+#define TEST_IMPL(IDX)                                       \
+  for (int i = 0; i < 4; i++) {                              \
+    _d[i] = sat_rdmlsh(_a[i], _b[i], _c[IDX]);               \
+  }                                                          \
+  d = vqrdmlsh_laneq_s16(a, b, c, IDX);                      \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif
+}
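+
+// Note (comment only): expected values for the vqrdmlsh tests come from
+// sat_rdmlsh in tests/common.h, which adds the rounding constant to the
+// doubled product and then saturating-subtracts the high half from the
+// accumulator, mirroring the vnclip(__RISCV_VXRM_RNU) + vssub sequence
+// in neon2rvv.h.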
+
+result_t test_vqrdmlshq_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+#if defined(__GNUC__)
+  return TEST_UNIMPL;
+#else
+  const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
+  const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
+  int16_t _d[8];
+  int16x8_t d;
+  int16x8_t a = vld1q_s16(_a);
+  int16x8_t b = vld1q_s16(_b);
+  int16x8_t c = vld1q_s16(_c);
+
+#define TEST_IMPL(IDX)                                                                    \
+  for (int i = 0; i < 8; i++) {                                                           \
+    _d[i] = sat_rdmlsh(_a[i], _b[i], _c[IDX]);                                            \
+  }                                                                                       \
+  d = vqrdmlshq_laneq_s16(a, b, c, IDX);                                                  \
+  CHECK_RESULT(validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]))
 
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#endif
+#else
+  return TEST_UNIMPL;
+#endif
+}
 
 result_t test_vqrdmlshq_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
diff --git a/tests/impl.h b/tests/impl.h
index b053816d..0bd62c66 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -2059,16 +2059,16 @@
   _(vqrdmulh_lane_s16)          \
   _(vqrdmulh_lane_s32)          \
   _(vqrdmlahq_lane_s16)         \
-  /*_(vqrdmlah_laneq_s16) */    \
-  /*_(vqrdmlahq_laneq_s16) */   \
+  _(vqrdmlah_laneq_s16)         \
+  _(vqrdmlahq_laneq_s16)        \
   _(vqrdmlahq_lane_s32)         \
-  /*_(vqrdmlah_laneq_s32) */    \
-  /*_(vqrdmlahq_laneq_s32) */   \
+  _(vqrdmlah_laneq_s32)         \
+  _(vqrdmlahq_laneq_s32)        \
   _(vqrdmlah_lane_s16)          \
   _(vqrdmlah_lane_s32)          \
   _(vqrdmlshq_lane_s16)         \
-  /*_(vqrdmlsh_laneq_s16) */    \
-  /*_(vqrdmlshq_laneq_s16) */   \
+  _(vqrdmlsh_laneq_s16)         \
+  _(vqrdmlshq_laneq_s16)        \
   _(vqrdmlshq_lane_s32)         \
   /*_(vqrdmlsh_laneq_s32) */    \
   /*_(vqrdmlshq_laneq_s32) */   \