From 53277241b53a19a7694c756f5aea931bbb3c95ad Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Sat, 6 Jan 2024 13:09:03 +0800 Subject: [PATCH] feat: Add _mm_hadd* and _mm_hsub* --- sse2rvv.h | 174 ++++++++++-- tests/debug_tools.cpp | 6 + tests/debug_tools.h | 10 + tests/impl.cpp | 598 +++++++++++++++++++++--------------------- 4 files changed, 469 insertions(+), 319 deletions(-) diff --git a/sse2rvv.h b/sse2rvv.h index bf66ab7..a04e6cd 100644 --- a/sse2rvv.h +++ b/sse2rvv.h @@ -1456,37 +1456,173 @@ FORCE_INLINE int _mm_extract_ps(__m128 a, const int imm8) { // FORCE_INLINE unsigned int _mm_getcsr (void) {} -// FORCE_INLINE __m128i _mm_hadd_epi16 (__m128i a, __m128i b) {} - -// FORCE_INLINE __m128i _mm_hadd_epi32 (__m128i a, __m128i b) {} - -// FORCE_INLINE __m128d _mm_hadd_pd (__m128d a, __m128d b) {} - -// FORCE_INLINE __m64 _mm_hadd_pi16 (__m64 a, __m64 b) {} +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i a, __m128i b) { + vint16m2_t _a = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(a)); + vint16m2_t _b = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(b)); + vint16m2_t ab = __riscv_vslideup_vx_i16m2(_a, _b, 8, 16); + vint16m2_t ab_s = __riscv_vslidedown_vx_i16m2(ab, 1, 16); + vint32m2_t ab_add = + __riscv_vreinterpret_v_i16m2_i32m2(__riscv_vadd_vv_i16m2(ab, ab_s, 16)); + return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_add, 0, 8)); +} + +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i a, __m128i b) { + vint32m2_t _a = __riscv_vlmul_ext_v_i32m1_i32m2(vreinterpretq_m128i_i32(a)); + vint32m2_t _b = __riscv_vlmul_ext_v_i32m1_i32m2(vreinterpretq_m128i_i32(b)); + vint32m2_t ab = __riscv_vslideup_vx_i32m2(_a, _b, 4, 8); + vint32m2_t ab_s = __riscv_vslidedown_vx_i32m2(ab, 1, 8); + vint64m2_t ab_add = + __riscv_vreinterpret_v_i32m2_i64m2(__riscv_vadd_vv_i32m2(ab, ab_s, 8)); + return vreinterpretq_i32_m128i(__riscv_vnsra_wx_i32m1(ab_add, 0, 4)); +} + +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { + vfloat64m2_t _a = __riscv_vlmul_ext_v_f64m1_f64m2(vreinterpretq_m128d_f64(a)); + vfloat64m2_t _b = __riscv_vlmul_ext_v_f64m1_f64m2(vreinterpretq_m128d_f64(b)); + vfloat64m2_t ab = __riscv_vslideup_vx_f64m2(_a, _b, 2, 4); + vfloat64m2_t ab_s = __riscv_vslidedown_vx_f64m2(ab, 1, 4); + vfloat64m2_t ab_add = __riscv_vfadd_vv_f64m2(ab, ab_s, 4); + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_s_x_u8m1(85, 2)); + return vreinterpretq_f64_m128d(__riscv_vlmul_trunc_v_f64m2_f64m1( + __riscv_vcompress_vm_f64m2(ab_add, mask, 4))); +} + +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint16m1_t ab = __riscv_vslideup_vx_i16m1(_a, _b, 4, 8); + vint16m1_t ab_s = __riscv_vslidedown_vx_i16m1(ab, 1, 8); + vint32m1_t ab_add = + __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vadd_vv_i16m1(ab, ab_s, 8)); + return vreinterpretq_i16_m64( + __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vnsra_wx_i16mf2(ab_add, 0, 4))); +} -// FORCE_INLINE __m64 _mm_hadd_pi32 (__m64 a, __m64 b) {} +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) { + vint32m1_t _a = vreinterpretq_m64_i32(a); + vint32m1_t _b = vreinterpretq_m64_i32(b); + vint32m1_t ab = __riscv_vslideup_vx_i32m1(_a, _b, 2, 4); + vint32m1_t ab_s = __riscv_vslidedown_vx_i32m1(ab, 1, 4); + vint64m1_t ab_add = + __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vadd_vv_i32m1(ab, ab_s, 4)); + return vreinterpretq_i32_m64( + __riscv_vlmul_ext_v_i32mf2_i32m1(__riscv_vnsra_wx_i32mf2(ab_add, 0, 2))); +} -// FORCE_INLINE __m128 _mm_hadd_ps (__m128 a, __m128 b) {} +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { + vfloat32m2_t _a = __riscv_vlmul_ext_v_f32m1_f32m2(vreinterpretq_m128_f32(a)); + vfloat32m2_t _b = __riscv_vlmul_ext_v_f32m1_f32m2(vreinterpretq_m128_f32(b)); + vfloat32m2_t ab = __riscv_vslideup_vx_f32m2(_a, _b, 4, 8); + vfloat32m2_t ab_s = __riscv_vslidedown_vx_f32m2(ab, 1, 8); + vint64m2_t ab_add = __riscv_vreinterpret_v_i32m2_i64m2( + __riscv_vreinterpret_v_f32m2_i32m2(__riscv_vfadd_vv_f32m2(ab, ab_s, 8))); + return vreinterpretq_i32_m128(__riscv_vnsra_wx_i32m1(ab_add, 0, 4)); +} -// FORCE_INLINE __m128i _mm_hadds_epi16 (__m128i a, __m128i b) {} +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i a, __m128i b) { + vint16m2_t _a = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(a)); + vint16m2_t _b = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(b)); + vint16m2_t ab = __riscv_vslideup_vx_i16m2(_a, _b, 8, 16); + vint16m2_t ab_s = __riscv_vslidedown_vx_i16m2(ab, 1, 16); + vint32m2_t ab_add = + __riscv_vreinterpret_v_i16m2_i32m2(__riscv_vsadd_vv_i16m2(ab, ab_s, 16)); + return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_add, 0, 8)); +} -// FORCE_INLINE __m64 _mm_hadds_pi16 (__m64 a, __m64 b) {} +FORCE_INLINE __m64 _mm_hadds_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint16m1_t ab = __riscv_vslideup_vx_i16m1(_a, _b, 4, 8); + vint16m1_t ab_s = __riscv_vslidedown_vx_i16m1(ab, 1, 8); + vint32m1_t ab_add = + __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vsadd_vv_i16m1(ab, ab_s, 8)); + return vreinterpretq_i16_m64( + __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vnsra_wx_i16mf2(ab_add, 0, 4))); +} -// FORCE_INLINE __m128i _mm_hsub_epi16 (__m128i a, __m128i b) {} +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i a, __m128i b) { + vint16m2_t _a = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(a)); + vint16m2_t _b = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(b)); + vint16m2_t ab = __riscv_vslideup_vx_i16m2(_a, _b, 8, 16); + vint16m2_t ab_s = __riscv_vslidedown_vx_i16m2(ab, 1, 16); + vint32m2_t ab_sub = + __riscv_vreinterpret_v_i16m2_i32m2(__riscv_vsub_vv_i16m2(ab, ab_s, 16)); + return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_sub, 0, 8)); +} -// FORCE_INLINE __m128i _mm_hsub_epi32 (__m128i a, __m128i b) {} +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i a, __m128i b) { + vint32m2_t _a = __riscv_vlmul_ext_v_i32m1_i32m2(vreinterpretq_m128i_i32(a)); + vint32m2_t _b = __riscv_vlmul_ext_v_i32m1_i32m2(vreinterpretq_m128i_i32(b)); + vint32m2_t ab = __riscv_vslideup_vx_i32m2(_a, _b, 4, 8); + vint32m2_t ab_s = __riscv_vslidedown_vx_i32m2(ab, 1, 8); + vint64m2_t ab_sub = + __riscv_vreinterpret_v_i32m2_i64m2(__riscv_vsub_vv_i32m2(ab, ab_s, 8)); + return vreinterpretq_i32_m128i(__riscv_vnsra_wx_i32m1(ab_sub, 0, 4)); +} -// FORCE_INLINE __m128d _mm_hsub_pd (__m128d a, __m128d b) {} +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) { + vfloat64m2_t _a = __riscv_vlmul_ext_v_f64m1_f64m2(vreinterpretq_m128d_f64(a)); + vfloat64m2_t _b = __riscv_vlmul_ext_v_f64m1_f64m2(vreinterpretq_m128d_f64(b)); + vfloat64m2_t ab = __riscv_vslideup_vx_f64m2(_a, _b, 2, 4); + vfloat64m2_t ab_s = __riscv_vslidedown_vx_f64m2(ab, 1, 4); + vfloat64m2_t ab_sub = __riscv_vfsub_vv_f64m2(ab, ab_s, 4); + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_s_x_u8m1(85, 2)); + return vreinterpretq_f64_m128d(__riscv_vlmul_trunc_v_f64m2_f64m1( + __riscv_vcompress_vm_f64m2(ab_sub, mask, 4))); +} -// FORCE_INLINE __m64 _mm_hsub_pi16 (__m64 a, __m64 b) {} +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint16m1_t ab = __riscv_vslideup_vx_i16m1(_a, _b, 4, 8); + vint16m1_t ab_s = __riscv_vslidedown_vx_i16m1(ab, 1, 8); + vint32m1_t ab_sub = + __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vsub_vv_i16m1(ab, ab_s, 8)); + return vreinterpretq_i16_m64( + __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vnsra_wx_i16mf2(ab_sub, 0, 4))); +} -// FORCE_INLINE __m64 _mm_hsub_pi32 (__m64 a, __m64 b) {} +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 a, __m64 b) { + vint32m1_t _a = vreinterpretq_m64_i32(a); + vint32m1_t _b = vreinterpretq_m64_i32(b); + vint32m1_t ab = __riscv_vslideup_vx_i32m1(_a, _b, 2, 4); + vint32m1_t ab_s = __riscv_vslidedown_vx_i32m1(ab, 1, 4); + vint64m1_t ab_sub = + __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vsub_vv_i32m1(ab, ab_s, 4)); + return vreinterpretq_i32_m64( + __riscv_vlmul_ext_v_i32mf2_i32m1(__riscv_vnsra_wx_i32mf2(ab_sub, 0, 2))); +} -// FORCE_INLINE __m128 _mm_hsub_ps (__m128 a, __m128 b) {} +FORCE_INLINE __m128 _mm_hsub_ps(__m128 a, __m128 b) { + vfloat32m2_t _a = __riscv_vlmul_ext_v_f32m1_f32m2(vreinterpretq_m128_f32(a)); + vfloat32m2_t _b = __riscv_vlmul_ext_v_f32m1_f32m2(vreinterpretq_m128_f32(b)); + vfloat32m2_t ab = __riscv_vslideup_vx_f32m2(_a, _b, 4, 8); + vfloat32m2_t ab_s = __riscv_vslidedown_vx_f32m2(ab, 1, 8); + vint64m2_t ab_sub = __riscv_vreinterpret_v_i32m2_i64m2( + __riscv_vreinterpret_v_f32m2_i32m2(__riscv_vfsub_vv_f32m2(ab, ab_s, 8))); + return vreinterpretq_i32_m128(__riscv_vnsra_wx_i32m1(ab_sub, 0, 4)); +} -// FORCE_INLINE __m128i _mm_hsubs_epi16 (__m128i a, __m128i b) {} +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i a, __m128i b) { + vint16m2_t _a = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(a)); + vint16m2_t _b = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(b)); + vint16m2_t ab = __riscv_vslideup_vx_i16m2(_a, _b, 8, 16); + vint16m2_t ab_s = __riscv_vslidedown_vx_i16m2(ab, 1, 16); + vint32m2_t ab_sub = + __riscv_vreinterpret_v_i16m2_i32m2(__riscv_vssub_vv_i16m2(ab, ab_s, 16)); + return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_sub, 0, 8)); +} -// FORCE_INLINE __m64 _mm_hsubs_pi16 (__m64 a, __m64 b) {} +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint16m1_t ab = __riscv_vslideup_vx_i16m1(_a, _b, 4, 8); + vint16m1_t ab_s = __riscv_vslidedown_vx_i16m1(ab, 1, 8); + vint32m1_t ab_sub = + __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vssub_vv_i16m1(ab, ab_s, 8)); + return vreinterpretq_i16_m64( + __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vnsra_wx_i16mf2(ab_sub, 0, 4))); +} FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int i, int imm8) { vint16m1_t _a = vreinterpretq_m128i_i16(a); diff --git a/tests/debug_tools.cpp b/tests/debug_tools.cpp index 590e2b7..6fdcb55 100644 --- a/tests/debug_tools.cpp +++ b/tests/debug_tools.cpp @@ -47,6 +47,9 @@ void print_64_bits_s64_arr(const char *var_name, const int64_t *u) { void print_64_bits_f32_arr(const char *var_name, const float *f) { printf("%s0: %.3f, %s1: %.3f\n", var_name, f[0], var_name, f[1]); } +void print_64_bits_f64_arr(const char *var_name, const double *f) { + printf("%s0: %.6f\n", var_name, f[0]); +} void print_128_bits_u8_arr(const char *var_name, const uint8_t *u) { printf("%s0: %3u, %s1: %3u, %s2: %3u, %s3: %3u, %s4: %3u, %s5: %3u, " "%s6: %3u, %s7: %3u, %s8: %3u, %s9: %3u, %s10: %3u, %s11: %3u, " @@ -97,6 +100,9 @@ void print_128_bits_f32_arr(const char *var_name, const float *f) { printf("%s0: %.3f, %s1: %.3f, %s2: %.3f, %s3: %.3f\n", var_name, f[0], var_name, f[1], var_name, f[2], var_name, f[3]); } +void print_128_bits_f64_arr(const char *var_name, const double *f) { + printf("%s0: %.6f, %s1: %.6f\n", var_name, f[0], var_name, f[1]); +} void print_u8_64(const char *var_name, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, uint8_t u6, uint8_t u7) { diff --git a/tests/debug_tools.h b/tests/debug_tools.h index 3f8a639..6d95cff 100644 --- a/tests/debug_tools.h +++ b/tests/debug_tools.h @@ -26,6 +26,7 @@ void print_64_bits_s32_arr(const char *var_name, const int32_t *u); void print_64_bits_u64_arr(const char *var_name, const uint64_t *u); void print_64_bits_s64_arr(const char *var_name, const int64_t *u); void print_64_bits_f32_arr(const char *var_name, const float *f); +void print_64_bits_f64_arr(const char *var_name, const double *f); void print_128_bits_u8_arr(const char *var_name, const uint8_t *u); void print_128_bits_s8_arr(const char *var_name, const int8_t *u); void print_128_bits_u16_arr(const char *var_name, const uint16_t *u); @@ -35,6 +36,7 @@ void print_128_bits_s32_arr(const char *var_name, const int32_t *u); void print_128_bits_u64_arr(const char *var_name, const uint64_t *u); void print_128_bits_s64_arr(const char *var_name, const int64_t *u); void print_128_bits_f32_arr(const char *var_name, const float *f); +void print_128_bits_f64_arr(const char *var_name, const double *f); void print_u8_64(const char *var_name, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, uint8_t u6, uint8_t u7); @@ -213,6 +215,14 @@ template void print_f32_128(const char *var_name, T *a) { const float *f = (const float *)a; print_128_bits_f32_arr(var_name, f); } +template void print_f64_128(const char *var_name, T a) { + const double *f = (const double *)&a; + print_128_bits_f64_arr(var_name, f); +} +template void print_f64_128(const char *var_name, T *a) { + const double *f = (const double *)a; + print_128_bits_f64_arr(var_name, f); +} } // namespace SSE2RVV diff --git a/tests/impl.cpp b/tests/impl.cpp index ed6c190..8ace021 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -8290,81 +8290,79 @@ result_t test_mm_addsub_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { } result_t test_mm_hadd_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const double *_a = (const double *)impl.test_cases_float_pointer1; - // const double *_b = (const double *)impl.test_cases_float_pointer2; - // - // double f0 = _a[0] + _a[1]; - // double f1 = _b[0] + _b[1]; - // - // __m128d a = load_m128d(_a); - // __m128d b = load_m128d(_b); - // __m128d c = _mm_hadd_pd(a, b); - // - // return validate_double(c, f0, f1); - // #else +#ifdef ENABLE_TEST_ALL + const double *_a = (const double *)impl.test_cases_float_pointer1; + const double *_b = (const double *)impl.test_cases_float_pointer2; + + double _c[2]; + _c[0] = _a[0] + _a[1]; + _c[1] = _b[0] + _b[1]; + + __m128d a = load_m128d(_a); + __m128d b = load_m128d(_b); + __m128d c = _mm_hadd_pd(a, b); + return validate_double(c, _c[0], _c[1]); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hadd_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); - // const float *_a = impl.test_cases_float_pointer1; - // const float *_b = impl.test_cases_float_pointer2; - // - // float f0 = _a[0] + _a[1]; - // float f1 = _a[2] + _a[3]; - // float f2 = _b[0] + _b[1]; - // float f3 = _b[2] + _b[3]; - // - // __m128 a = load_m128(_a); - // __m128 b = load_m128(_b); - // __m128 c = _mm_hadd_ps(a, b); - // - // return validate_float(c, f0, f1, f2, f3); - // #else +#ifdef ENABLE_TEST_ALL + const float *_a = impl.test_cases_float_pointer1; + const float *_b = impl.test_cases_float_pointer2; + + float f0 = _a[0] + _a[1]; + float f1 = _a[2] + _a[3]; + float f2 = _b[0] + _b[1]; + float f3 = _b[2] + _b[3]; + + __m128 a = load_m128(_a); + __m128 b = load_m128(_b); + __m128 c = _mm_hadd_ps(a, b); + + return validate_float(c, f0, f1, f2, f3); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hsub_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const double *_a = (const double *)impl.test_cases_float_pointer1; - // const double *_b = (const double *)impl.test_cases_float_pointer2; - // - // double f0 = _a[0] - _a[1]; - // double f1 = _b[0] - _b[1]; - // - // __m128d a = load_m128d(_a); - // __m128d b = load_m128d(_b); - // __m128d c = _mm_hsub_pd(a, b); - // - // return validate_double(c, f0, f1); - // #else +#ifdef ENABLE_TEST_ALL + const double *_a = (const double *)impl.test_cases_float_pointer1; + const double *_b = (const double *)impl.test_cases_float_pointer2; + + double f0 = _a[0] - _a[1]; + double f1 = _b[0] - _b[1]; + + __m128d a = load_m128d(_a); + __m128d b = load_m128d(_b); + __m128d c = _mm_hsub_pd(a, b); + + return validate_double(c, f0, f1); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hsub_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); - // const float *_a = impl.test_cases_float_pointer1; - // const float *_b = impl.test_cases_float_pointer2; - // - // float f0 = _a[0] - _a[1]; - // float f1 = _a[2] - _a[3]; - // float f2 = _b[0] - _b[1]; - // float f3 = _b[2] - _b[3]; - // - // __m128 a = load_m128(_a); - // __m128 b = load_m128(_b); - // __m128 c = _mm_hsub_ps(a, b); - // - // return validate_float(c, f0, f1, f2, f3); - // #else +#ifdef ENABLE_TEST_ALL + const float *_a = impl.test_cases_float_pointer1; + const float *_b = impl.test_cases_float_pointer2; + + float f0 = _a[0] - _a[1]; + float f1 = _a[2] - _a[3]; + float f2 = _b[0] - _b[1]; + float f3 = _b[2] - _b[3]; + + __m128 a = load_m128(_a); + __m128 b = load_m128(_b); + __m128 c = _mm_hsub_ps(a, b); + + return validate_float(c, f0, f1, f2, f3); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_lddqu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { @@ -8636,289 +8634,289 @@ result_t test_mm_alignr_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { } result_t test_mm_hadd_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; - // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; - // int16_t d[8]; - // d[0] = _a[0] + _a[1]; - // d[1] = _a[2] + _a[3]; - // d[2] = _a[4] + _a[5]; - // d[3] = _a[6] + _a[7]; - // d[4] = _b[0] + _b[1]; - // d[5] = _b[2] + _b[3]; - // d[6] = _b[4] + _b[5]; - // d[7] = _b[6] + _b[7]; - // __m128i a = load_m128i(_a); - // __m128i b = load_m128i(_b); - // __m128i ret = _mm_hadd_epi16(a, b); - // return VALIDATE_INT16_M128(ret, d); - // #else +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + int16_t d[8]; + d[0] = _a[0] + _a[1]; + d[1] = _a[2] + _a[3]; + d[2] = _a[4] + _a[5]; + d[3] = _a[6] + _a[7]; + d[4] = _b[0] + _b[1]; + d[5] = _b[2] + _b[3]; + d[6] = _b[4] + _b[5]; + d[7] = _b[6] + _b[7]; + __m128i a = load_m128i(_a); + __m128i b = load_m128i(_b); + __m128i ret = _mm_hadd_epi16(a, b); + return VALIDATE_INT16_M128(ret, d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hadd_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; - // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; - // int32_t d[4]; - // d[0] = _a[0] + _a[1]; - // d[1] = _a[2] + _a[3]; - // d[2] = _b[0] + _b[1]; - // d[3] = _b[2] + _b[3]; - // __m128i a = load_m128i(_a); - // __m128i b = load_m128i(_b); - // __m128i ret = _mm_hadd_epi32(a, b); - // return VALIDATE_INT32_M128(ret, d); - // #else +#ifdef ENABLE_TEST_ALL + const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + int32_t d[4]; + d[0] = _a[0] + _a[1]; + d[1] = _a[2] + _a[3]; + d[2] = _b[0] + _b[1]; + d[3] = _b[2] + _b[3]; + __m128i a = load_m128i(_a); + __m128i b = load_m128i(_b); + __m128i ret = _mm_hadd_epi32(a, b); + return VALIDATE_INT32_M128(ret, d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hadd_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; - // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; - // int16_t d[4]; - // d[0] = _a[0] + _a[1]; - // d[1] = _a[2] + _a[3]; - // d[2] = _b[0] + _b[1]; - // d[3] = _b[2] + _b[3]; - // __m64 a = load_m64(_a); - // __m64 b = load_m64(_b); - // __m64 ret = _mm_hadd_pi16(a, b); - // return VALIDATE_INT16_M64(ret, d); - // #else +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + int16_t d[4]; + d[0] = _a[0] + _a[1]; + d[1] = _a[2] + _a[3]; + d[2] = _b[0] + _b[1]; + d[3] = _b[2] + _b[3]; + __m64 a = load_m64(_a); + __m64 b = load_m64(_b); + __m64 ret = _mm_hadd_pi16(a, b); + return VALIDATE_INT16_M64(ret, d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hadd_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; - // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; - // int32_t d[2]; - // d[0] = _a[0] + _a[1]; - // d[1] = _b[0] + _b[1]; - // __m64 a = load_m64(_a); - // __m64 b = load_m64(_b); - // __m64 ret = _mm_hadd_pi32(a, b); - // return VALIDATE_INT32_M64(ret, d); - // #else +#ifdef ENABLE_TEST_ALL + const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + int32_t d[2]; + d[0] = _a[0] + _a[1]; + d[1] = _b[0] + _b[1]; + __m64 a = load_m64(_a); + __m64 b = load_m64(_b); + __m64 ret = _mm_hadd_pi32(a, b); + return VALIDATE_INT32_M64(ret, d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hadds_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; - // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; - // - // int16_t d16[8]; - // int32_t d32[8]; - // d32[0] = (int32_t)_a[0] + (int32_t)_a[1]; - // d32[1] = (int32_t)_a[2] + (int32_t)_a[3]; - // d32[2] = (int32_t)_a[4] + (int32_t)_a[5]; - // d32[3] = (int32_t)_a[6] + (int32_t)_a[7]; - // d32[4] = (int32_t)_b[0] + (int32_t)_b[1]; - // d32[5] = (int32_t)_b[2] + (int32_t)_b[3]; - // d32[6] = (int32_t)_b[4] + (int32_t)_b[5]; - // d32[7] = (int32_t)_b[6] + (int32_t)_b[7]; - // for (int i = 0; i < 8; i++) { - // if (d32[i] > (int32_t)INT16_MAX) - // d16[i] = INT16_MAX; - // else if (d32[i] < (int32_t)INT16_MIN) - // d16[i] = INT16_MIN; - // else - // d16[i] = (int16_t)d32[i]; - // } - // - // __m128i a = load_m128i(_a); - // __m128i b = load_m128i(_b); - // __m128i c = _mm_hadds_epi16(a, b); - // - // return VALIDATE_INT16_M128(c, d16); - // #else +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + + int16_t d16[8]; + int32_t d32[8]; + d32[0] = (int32_t)_a[0] + (int32_t)_a[1]; + d32[1] = (int32_t)_a[2] + (int32_t)_a[3]; + d32[2] = (int32_t)_a[4] + (int32_t)_a[5]; + d32[3] = (int32_t)_a[6] + (int32_t)_a[7]; + d32[4] = (int32_t)_b[0] + (int32_t)_b[1]; + d32[5] = (int32_t)_b[2] + (int32_t)_b[3]; + d32[6] = (int32_t)_b[4] + (int32_t)_b[5]; + d32[7] = (int32_t)_b[6] + (int32_t)_b[7]; + for (int i = 0; i < 8; i++) { + if (d32[i] > (int32_t)INT16_MAX) + d16[i] = INT16_MAX; + else if (d32[i] < (int32_t)INT16_MIN) + d16[i] = INT16_MIN; + else + d16[i] = (int16_t)d32[i]; + } + + __m128i a = load_m128i(_a); + __m128i b = load_m128i(_b); + __m128i c = _mm_hadds_epi16(a, b); + + return VALIDATE_INT16_M128(c, d16); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hadds_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; - // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; - // - // int16_t d16[8]; - // int32_t d32[8]; - // d32[0] = (int32_t)_a[0] + (int32_t)_a[1]; - // d32[1] = (int32_t)_a[2] + (int32_t)_a[3]; - // d32[2] = (int32_t)_b[0] + (int32_t)_b[1]; - // d32[3] = (int32_t)_b[2] + (int32_t)_b[3]; - // for (int i = 0; i < 8; i++) { - // if (d32[i] > (int32_t)INT16_MAX) - // d16[i] = INT16_MAX; - // else if (d32[i] < (int32_t)INT16_MIN) - // d16[i] = INT16_MIN; - // else - // d16[i] = (int16_t)d32[i]; - // } - // - // __m64 a = load_m64(_a); - // __m64 b = load_m64(_b); - // __m64 c = _mm_hadds_pi16(a, b); - // - // return VALIDATE_INT16_M64(c, d16); - // #else +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + + int16_t d16[8]; + int32_t d32[8]; + d32[0] = (int32_t)_a[0] + (int32_t)_a[1]; + d32[1] = (int32_t)_a[2] + (int32_t)_a[3]; + d32[2] = (int32_t)_b[0] + (int32_t)_b[1]; + d32[3] = (int32_t)_b[2] + (int32_t)_b[3]; + for (int i = 0; i < 8; i++) { + if (d32[i] > (int32_t)INT16_MAX) + d16[i] = INT16_MAX; + else if (d32[i] < (int32_t)INT16_MIN) + d16[i] = INT16_MIN; + else + d16[i] = (int16_t)d32[i]; + } + + __m64 a = load_m64(_a); + __m64 b = load_m64(_b); + __m64 c = _mm_hadds_pi16(a, b); + + return VALIDATE_INT16_M64(c, d16); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hsub_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; - // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; - // - // int16_t d[8]; - // d[0] = _a[0] - _a[1]; - // d[1] = _a[2] - _a[3]; - // d[2] = _a[4] - _a[5]; - // d[3] = _a[6] - _a[7]; - // d[4] = _b[0] - _b[1]; - // d[5] = _b[2] - _b[3]; - // d[6] = _b[4] - _b[5]; - // d[7] = _b[6] - _b[7]; - // - // __m128i a = load_m128i(_a); - // __m128i b = load_m128i(_b); - // __m128i c = _mm_hsub_epi16(a, b); - // - // return VALIDATE_INT16_M128(c, d); - // #else +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + + int16_t d[8]; + d[0] = _a[0] - _a[1]; + d[1] = _a[2] - _a[3]; + d[2] = _a[4] - _a[5]; + d[3] = _a[6] - _a[7]; + d[4] = _b[0] - _b[1]; + d[5] = _b[2] - _b[3]; + d[6] = _b[4] - _b[5]; + d[7] = _b[6] - _b[7]; + + __m128i a = load_m128i(_a); + __m128i b = load_m128i(_b); + __m128i c = _mm_hsub_epi16(a, b); + + return VALIDATE_INT16_M128(c, d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hsub_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int32_t *_a = impl.test_cases_int_pointer1; - // const int32_t *_b = impl.test_cases_int_pointer1; - // - // int32_t d[4]; - // d[0] = _a[0] - _a[1]; - // d[1] = _a[2] - _a[3]; - // d[2] = _b[0] - _b[1]; - // d[3] = _b[2] - _b[3]; - // - // __m128i a = load_m128i(_a); - // __m128i b = load_m128i(_b); - // __m128i c = _mm_hsub_epi32(a, b); - // - // return VALIDATE_INT32_M128(c, d); - // #else +#ifdef ENABLE_TEST_ALL + const int32_t *_a = impl.test_cases_int_pointer1; + const int32_t *_b = impl.test_cases_int_pointer1; + + int32_t d[4]; + d[0] = _a[0] - _a[1]; + d[1] = _a[2] - _a[3]; + d[2] = _b[0] - _b[1]; + d[3] = _b[2] - _b[3]; + + __m128i a = load_m128i(_a); + __m128i b = load_m128i(_b); + __m128i c = _mm_hsub_epi32(a, b); + + return VALIDATE_INT32_M128(c, d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hsub_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; - // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; - // - // int16_t d[4]; - // d[0] = _a[0] - _a[1]; - // d[1] = _a[2] - _a[3]; - // d[2] = _b[0] - _b[1]; - // d[3] = _b[2] - _b[3]; - // __m64 a = load_m64(_a); - // __m64 b = load_m64(_b); - // __m64 c = _mm_hsub_pi16(a, b); - // - // return VALIDATE_INT16_M64(c, d); - // #else +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + + int16_t d[4]; + d[0] = _a[0] - _a[1]; + d[1] = _a[2] - _a[3]; + d[2] = _b[0] - _b[1]; + d[3] = _b[2] - _b[3]; + __m64 a = load_m64(_a); + __m64 b = load_m64(_b); + __m64 c = _mm_hsub_pi16(a, b); + + return VALIDATE_INT16_M64(c, d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hsub_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int32_t *_a = impl.test_cases_int_pointer1; - // const int32_t *_b = impl.test_cases_int_pointer2; - // - // int32_t d[2]; - // d[0] = _a[0] - _a[1]; - // d[1] = _b[0] - _b[1]; - // - // __m64 a = load_m64(_a); - // __m64 b = load_m64(_b); - // __m64 c = _mm_hsub_pi32(a, b); - // - // return VALIDATE_INT32_M64(c, d); - // #else +#ifdef ENABLE_TEST_ALL + const int32_t *_a = impl.test_cases_int_pointer1; + const int32_t *_b = impl.test_cases_int_pointer2; + + int32_t d[2]; + d[0] = _a[0] - _a[1]; + d[1] = _b[0] - _b[1]; + + __m64 a = load_m64(_a); + __m64 b = load_m64(_b); + __m64 c = _mm_hsub_pi32(a, b); + + return VALIDATE_INT32_M64(c, d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hsubs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; - // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; - // - // int16_t d16[8]; - // int32_t d32[8]; - // d32[0] = (int32_t)_a[0] - (int32_t)_a[1]; - // d32[1] = (int32_t)_a[2] - (int32_t)_a[3]; - // d32[2] = (int32_t)_a[4] - (int32_t)_a[5]; - // d32[3] = (int32_t)_a[6] - (int32_t)_a[7]; - // d32[4] = (int32_t)_b[0] - (int32_t)_b[1]; - // d32[5] = (int32_t)_b[2] - (int32_t)_b[3]; - // d32[6] = (int32_t)_b[4] - (int32_t)_b[5]; - // d32[7] = (int32_t)_b[6] - (int32_t)_b[7]; - // for (int i = 0; i < 8; i++) { - // if (d32[i] > (int32_t)INT16_MAX) - // d16[i] = INT16_MAX; - // else if (d32[i] < (int32_t)INT16_MIN) - // d16[i] = INT16_MIN; - // else - // d16[i] = (int16_t)d32[i]; - // } - // - // __m128i a = load_m128i(_a); - // __m128i b = load_m128i(_b); - // __m128i c = _mm_hsubs_epi16(a, b); - // - // return VALIDATE_INT16_M128(c, d16); - // #else +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + + int16_t d16[8]; + int32_t d32[8]; + d32[0] = (int32_t)_a[0] - (int32_t)_a[1]; + d32[1] = (int32_t)_a[2] - (int32_t)_a[3]; + d32[2] = (int32_t)_a[4] - (int32_t)_a[5]; + d32[3] = (int32_t)_a[6] - (int32_t)_a[7]; + d32[4] = (int32_t)_b[0] - (int32_t)_b[1]; + d32[5] = (int32_t)_b[2] - (int32_t)_b[3]; + d32[6] = (int32_t)_b[4] - (int32_t)_b[5]; + d32[7] = (int32_t)_b[6] - (int32_t)_b[7]; + for (int i = 0; i < 8; i++) { + if (d32[i] > (int32_t)INT16_MAX) + d16[i] = INT16_MAX; + else if (d32[i] < (int32_t)INT16_MIN) + d16[i] = INT16_MIN; + else + d16[i] = (int16_t)d32[i]; + } + + __m128i a = load_m128i(_a); + __m128i b = load_m128i(_b); + __m128i c = _mm_hsubs_epi16(a, b); + + return VALIDATE_INT16_M128(c, d16); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_hsubs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; - // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; - // - // int32_t _d[4]; - // _d[0] = (int32_t)_a[0] - (int32_t)_a[1]; - // _d[1] = (int32_t)_a[2] - (int32_t)_a[3]; - // _d[2] = (int32_t)_b[0] - (int32_t)_b[1]; - // _d[3] = (int32_t)_b[2] - (int32_t)_b[3]; - // - // for (int i = 0; i < 4; i++) { - // if (_d[i] > (int32_t)INT16_MAX) { - // _d[i] = INT16_MAX; - // } else if (_d[i] < (int32_t)INT16_MIN) { - // _d[i] = INT16_MIN; - // } - // } - // - // __m64 a = load_m64(_a); - // __m64 b = load_m64(_b); - // __m64 c = _mm_hsubs_pi16(a, b); - // - // return VALIDATE_INT16_M64(c, _d); - // #else +#ifdef ENABLE_TEST_ALL + const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + + int32_t _d[4]; + _d[0] = (int32_t)_a[0] - (int32_t)_a[1]; + _d[1] = (int32_t)_a[2] - (int32_t)_a[3]; + _d[2] = (int32_t)_b[0] - (int32_t)_b[1]; + _d[3] = (int32_t)_b[2] - (int32_t)_b[3]; + + for (int i = 0; i < 4; i++) { + if (_d[i] > (int32_t)INT16_MAX) { + _d[i] = INT16_MAX; + } else if (_d[i] < (int32_t)INT16_MIN) { + _d[i] = INT16_MIN; + } + } + + __m64 a = load_m64(_a); + __m64 b = load_m64(_b); + __m64 c = _mm_hsubs_pi16(a, b); + + return VALIDATE_INT16_M64(c, _d); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_maddubs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {