Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add vqrdml[a|s]h[q]_lane[q]_[s16|s32] #499

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9844,60 +9844,60 @@ FORCE_INLINE int32x2_t vqrdmulh_lane_s32(int32x2_t a, int32x2_t b, const int c)
return __riscv_vnclip_wx_i32m1(ab_mulx2, 32, __RISCV_VXRM_RNU, 2);
}

/*
 * vqrdmlahq_lane_s16: signed saturating rounding doubling multiply-accumulate
 * (high half) against one lane of c, for 8 x int16.
 *
 *   a    - accumulator vector
 *   b    - multiplicand vector
 *   c    - vector supplying the scalar multiplier
 *   lane - index of the element of c to multiply by
 *
 * Returns sat_add(a[i], sat(round((2 * b[i] * c[lane]) >> 16))) per element.
 *
 * NOTE(review): the product is rounded and saturated to 16 bits before being
 * accumulated; confirm against the Arm SQRDMLAH pseudocode whether the
 * accumulate should happen in the wide domain for exact edge-case parity.
 */
FORCE_INLINE int16x8_t vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
  // Broadcast c[lane] across all 8 active elements.
  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
  // round(2*p >> 16) == round(p >> 15). Narrowing by 15 avoids the separate
  // non-saturating doubling shift, which wrapped to INT32_MIN when
  // b[i] == c[lane] == INT16_MIN; vnclip saturates that case to INT16_MAX.
  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mul, 15, __RISCV_VXRM_RNU, 8);
  return __riscv_vsadd_vv_i16m1(a, bc_s, 8);
}

// Prototypes only: no definition of the laneq (128-bit lane source) s16
// variants is visible in this section. The commented lines are the earlier
// placeholders, which named the lane-source operand `v` instead of `c`.
// FORCE_INLINE int16x4_t vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
FORCE_INLINE int16x4_t vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane);

// FORCE_INLINE int16x8_t vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, const int lane);
FORCE_INLINE int16x8_t vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane);

/*
 * vqrdmlahq_lane_s32: signed saturating rounding doubling multiply-accumulate
 * (high half) against one lane of c, for 4 x int32.
 *
 *   a    - accumulator vector
 *   b    - multiplicand vector
 *   c    - vector supplying the scalar multiplier
 *   lane - index of the element of c to multiply by
 *
 * Returns sat_add(a[i], sat(round((2 * b[i] * c[lane]) >> 32))) per element.
 *
 * NOTE(review): the product is rounded and saturated to 32 bits before being
 * accumulated; confirm against the Arm SQRDMLAH pseudocode whether the
 * accumulate should happen in the wide domain for exact edge-case parity.
 */
FORCE_INLINE int32x4_t vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
  // Broadcast c[lane] across all 4 active elements.
  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 4);
  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
  // round(2*p >> 32) == round(p >> 31). Narrowing by 31 avoids the separate
  // non-saturating doubling shift, which wrapped to INT64_MIN when
  // b[i] == c[lane] == INT32_MIN; vnclip saturates that case to INT32_MAX.
  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mul, 31, __RISCV_VXRM_RNU, 4);
  return __riscv_vsadd_vv_i32m1(a, bc_s, 4);
}

// Prototypes only: no definition of the laneq (128-bit lane source) s32
// variants is visible in this section. The commented lines are the earlier
// placeholders, which named the lane-source operand `v` instead of `c`.
// FORCE_INLINE int32x2_t vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, const int lane);
FORCE_INLINE int32x2_t vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t c, const int lane);

// FORCE_INLINE int32x4_t vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v, const int lane);
FORCE_INLINE int32x4_t vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t c, const int lane);

/*
 * vqrdmlah_lane_s16: signed saturating rounding doubling multiply-accumulate
 * (high half) against one lane of c, for 4 x int16.
 *
 *   a    - accumulator vector
 *   b    - multiplicand vector
 *   c    - vector supplying the scalar multiplier
 *   lane - index of the element of c to multiply by
 *
 * Returns sat_add(a[i], sat(round((2 * b[i] * c[lane]) >> 16))) per element.
 *
 * NOTE(review): the product is rounded and saturated to 16 bits before being
 * accumulated; confirm against the Arm SQRDMLAH pseudocode whether the
 * accumulate should happen in the wide domain for exact edge-case parity.
 */
FORCE_INLINE int16x4_t vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int lane) {
  // Broadcast c[lane] across all 4 active elements.
  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
  // round(2*p >> 16) == round(p >> 15). Narrowing by 15 avoids the separate
  // non-saturating doubling shift, which wrapped to INT32_MIN when
  // b[i] == c[lane] == INT16_MIN; vnclip saturates that case to INT16_MAX.
  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mul, 15, __RISCV_VXRM_RNU, 4);
  return __riscv_vsadd_vv_i16m1(a, bc_s, 4);
}

/*
 * vqrdmlah_lane_s32: signed saturating rounding doubling multiply-accumulate
 * (high half) against one lane of c, for 2 x int32.
 *
 *   a    - accumulator vector
 *   b    - multiplicand vector
 *   c    - vector supplying the scalar multiplier
 *   lane - index of the element of c to multiply by
 *
 * Returns sat_add(a[i], sat(round((2 * b[i] * c[lane]) >> 32))) per element.
 *
 * NOTE(review): the product is rounded and saturated to 32 bits before being
 * accumulated; confirm against the Arm SQRDMLAH pseudocode whether the
 * accumulate should happen in the wide domain for exact edge-case parity.
 */
FORCE_INLINE int32x2_t vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int lane) {
  // Broadcast c[lane] across both active elements.
  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
  // round(2*p >> 32) == round(p >> 31). Narrowing by 31 avoids the separate
  // non-saturating doubling shift, which wrapped to INT64_MIN when
  // b[i] == c[lane] == INT32_MIN; vnclip saturates that case to INT32_MAX.
  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mul, 31, __RISCV_VXRM_RNU, 2);
  return __riscv_vsadd_vv_i32m1(a, bc_s, 2);
}

/*
 * vqrdmlshq_lane_s16: signed saturating rounding doubling multiply-subtract
 * (high half) against one lane of c, for 8 x int16.
 *
 *   a    - minuend / accumulator vector
 *   b    - multiplicand vector
 *   c    - vector supplying the scalar multiplier
 *   lane - index of the element of c to multiply by
 *
 * Returns sat_sub(a[i], sat(round((2 * b[i] * c[lane]) >> 16))) per element.
 *
 * NOTE(review): the product is rounded and saturated to 16 bits before being
 * subtracted; confirm against the Arm SQRDMLSH pseudocode whether the
 * subtract should happen in the wide domain for exact edge-case parity.
 */
FORCE_INLINE int16x8_t vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
  // The result is int16x8_t, so every step must run with vl = 8; the previous
  // vl of 4 left the upper four result lanes uncomputed.
  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
  // round(2*p >> 16) == round(p >> 15). Narrowing by 15 avoids the separate
  // non-saturating doubling shift, which wrapped to INT32_MIN when
  // b[i] == c[lane] == INT16_MIN; vnclip saturates that case to INT16_MAX.
  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mul, 15, __RISCV_VXRM_RNU, 8);
  return __riscv_vssub_vv_i16m1(a, bc_s, 8);
}

// Prototypes only: no definition of the laneq (128-bit lane source) s16
// multiply-subtract variants is visible in this section. The commented lines
// are the earlier placeholders, which named the lane-source operand `v`
// instead of `c`.
// FORCE_INLINE int16x4_t vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
FORCE_INLINE int16x4_t vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane);

// FORCE_INLINE int16x8_t vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, const int lane);
FORCE_INLINE int16x8_t vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane);

FORCE_INLINE int32x4_t vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int __d) {
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
FORCE_INLINE int32x4_t vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 2);
vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_mulx2, 32, __RISCV_VXRM_RNU, 2);
Expand Down
4 changes: 2 additions & 2 deletions tests/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,12 @@ TEST_SATURATE_ADD_SUB(64)
} \
static inline int##CBIT##_t sat_rdmlah(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \
int##HBIT##_t tmp = sat_dmull(b, c); \
tmp = sat_add(tmp, (int##HBIT##_t)(1 << (CBIT - 1))); \
tmp = sat_add(tmp, (int##HBIT##_t)((int##HBIT##_t)1 << (CBIT - 1))); \
return sat_add(a, (int##CBIT##_t)(tmp >> CBIT)); \
} \
static inline int##CBIT##_t sat_rdmlsh(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \
int##HBIT##_t tmp = sat_dmull(b, c); \
tmp = sat_sub(tmp, (int##HBIT##_t)(1 << (CBIT - 1))); \
tmp = sat_add(tmp, (int##HBIT##_t)((int##HBIT##_t)1 << (CBIT - 1))); \
return sat_sub(a, (int##CBIT##_t)(tmp >> CBIT)); \
}
TEST_SATURATE_DMUL(8, 16)
Expand Down
Loading
Loading