Skip to content

Commit

Permalink
Merge pull request #491 from howjmay/vqdmlal
Browse files Browse the repository at this point in the history
feat: Add vqdmlal[h|s]_lane[q]_[s16|s32]
  • Loading branch information
howjmay authored Jul 30, 2024
2 parents ad630ec + 1d79156 commit fbef716
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 12 deletions.
28 changes: 24 additions & 4 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9210,9 +9210,19 @@ FORCE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c, c
return __riscv_vadd_vv_i64m1(a, bc_mulx2, 2);
}

// FORCE_INLINE int32_t vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t v, const int lane);
FORCE_INLINE int32_t vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c, const int lane) {
int16_t c_lane = vget_lane_s16(c, lane);
int32_t dmull = (int32_t)b * (int32_t)c_lane;
dmull = dmull > INT32_MAX / 2 ? INT32_MAX : dmull < INT32_MIN / 2 ? INT32_MIN : dmull * 2;
return sat_add_int32(a, dmull);
}

// FORCE_INLINE int64_t vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t v, const int lane);
FORCE_INLINE int64_t vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c, const int lane) {
int32_t c_lane = vget_lane_s32(c, lane);
int64_t dmull = (int64_t)b * (int64_t)c_lane;
dmull = dmull > INT64_MAX / 2 ? INT64_MAX : dmull < INT64_MIN / 2 ? INT64_MIN : dmull * 2;
return sat_add_int64(a, dmull);
}

FORCE_INLINE int32x4_t vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t c, const int lane) {
vint16m1_t b_high = __riscv_vslidedown_vx_i16m1(b, 4, 8);
Expand Down Expand Up @@ -9244,9 +9254,19 @@ FORCE_INLINE int64x2_t vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t c,
return __riscv_vadd_vv_i64m1(a, bc_mulx2, 2);
}

// FORCE_INLINE int32_t vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c, const int lane);
FORCE_INLINE int32_t vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c, const int lane) {
int16_t c_lane = vgetq_lane_s16(c, lane);
int32_t dmull = (int32_t)b * (int32_t)c_lane;
dmull = dmull > INT32_MAX / 2 ? INT32_MAX : dmull < INT32_MIN / 2 ? INT32_MIN : dmull * 2;
return sat_add_int32(a, dmull);
}

// FORCE_INLINE int64_t vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c, const int lane);
FORCE_INLINE int64_t vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c, const int lane) {
int32_t c_lane = vgetq_lane_s32(c, lane);
int64_t dmull = (int64_t)b * (int64_t)c_lane;
dmull = dmull > INT64_MAX / 2 ? INT64_MAX : dmull < INT64_MIN / 2 ? INT64_MIN : dmull * 2;
return sat_add_int64(a, dmull);
}

FORCE_INLINE int32x4_t vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t c, const int lane) {
vint16m1_t b_high = __riscv_vslidedown_vx_i16m1(b, 4, 8);
Expand Down
88 changes: 84 additions & 4 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33423,9 +33423,49 @@ result_t test_vqdmlal_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#endif // ENABLE_TEST_ALL
}

result_t test_vqdmlalh_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vqdmlalh_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
int32_t _d, d;
int16x4_t c = vld1_s16(_c);

#define TEST_IMPL(IDX) \
_d = sat_add(_a[0], sat_dmull(_b[0], _c[IDX])); \
d = vqdmlalh_lane_s16(_a[0], _b[0], c, IDX); \
CHECK_RESULT(d == _d ? TEST_SUCCESS : TEST_FAIL)

IMM_4_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vqdmlals_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vqdmlals_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const int64_t *_a = (int64_t *)impl.test_cases_int_pointer1;
const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3;
int64_t _d, d;
int32x2_t c = vld1_s32(_c);

#define TEST_IMPL(IDX) \
_d = sat_add(_a[0], sat_dmull(_b[0], _c[IDX])); \
d = vqdmlals_lane_s32(_a[0], _b[0], c, IDX); \
CHECK_RESULT(d == _d ? TEST_SUCCESS : TEST_FAIL)

IMM_2_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vqdmlal_high_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
Expand Down Expand Up @@ -33535,9 +33575,49 @@ result_t test_vqdmlal_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#endif // ENABLE_TEST_ALL
}

result_t test_vqdmlalh_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vqdmlalh_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
int32_t _d, d;
int16x8_t c = vld1q_s16(_c);

#define TEST_IMPL(IDX) \
_d = sat_add(_a[0], sat_dmull(_b[0], _c[IDX])); \
d = vqdmlalh_laneq_s16(_a[0], _b[0], c, IDX); \
CHECK_RESULT(d == _d ? TEST_SUCCESS : TEST_FAIL)

IMM_8_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vqdmlals_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
result_t test_vqdmlals_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
const int64_t *_a = (int64_t *)impl.test_cases_int_pointer1;
const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3;
int64_t _d, d;
int32x4_t c = vld1q_s32(_c);

#define TEST_IMPL(IDX) \
_d = sat_add(_a[0], sat_dmull(_b[0], _c[IDX])); \
d = vqdmlals_laneq_s32(_a[0], _b[0], c, IDX); \
CHECK_RESULT(d == _d ? TEST_SUCCESS : TEST_FAIL)

IMM_4_ITER
#undef TEST_IMPL

return TEST_SUCCESS;
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_vqdmlal_high_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
Expand Down
8 changes: 4 additions & 4 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1948,14 +1948,14 @@
_(vmlal_high_laneq_u32) \
_(vqdmlal_lane_s16) \
_(vqdmlal_lane_s32) \
/*_(vqdmlalh_lane_s16) */ \
/*_(vqdmlals_lane_s32) */ \
_(vqdmlalh_lane_s16) \
_(vqdmlals_lane_s32) \
_(vqdmlal_high_lane_s16) \
_(vqdmlal_high_lane_s32) \
_(vqdmlal_laneq_s16) \
_(vqdmlal_laneq_s32) \
/*_(vqdmlalh_laneq_s16) */ \
/*_(vqdmlals_laneq_s32) */ \
_(vqdmlalh_laneq_s16) \
_(vqdmlals_laneq_s32) \
_(vqdmlal_high_laneq_s16) \
_(vqdmlal_high_laneq_s32) \
_(vmls_lane_s16) \
Expand Down

0 comments on commit fbef716

Please sign in to comment.