From 650cfca557a5e7642da624019f88359ac1d86b16 Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Sun, 24 Dec 2023 11:41:04 +0800 Subject: [PATCH] feat: Add _mm_and_* --- sse2rvv.h | 24 ++++++++-- tests/impl.cpp | 124 ++++++++++++++++++++++++------------------------- 2 files changed, 81 insertions(+), 67 deletions(-) diff --git a/sse2rvv.h b/sse2rvv.h index 0631b7a..d9945a0 100644 --- a/sse2rvv.h +++ b/sse2rvv.h @@ -97,7 +97,7 @@ typedef vint32m1_t __m128i; /* 128-bit vector containing integers */ __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_f32m1_i32m1(x)) #define vreinterpretq_m128_i32(x) __riscv_vreinterpret_v_f32m1_i32m1(x) #define vreinterpretq_m128_i64(x) \ - __riscv_vreinterpret_v_f64m1_i64m1(__riscv_vreinterpret_v_f32m1_f64m1(x)) + __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(x)) #define vreinterpretq_m128_f32(x) (x) #define vreinterpretq_m128_f64(x) __riscv_vreinterpret_v_f32m1_f64m1(x) @@ -474,7 +474,9 @@ typedef struct { // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { - return __riscv_vfadd_vv_f32m1(a, b, 4); + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return vreinterpretq_f32_m128(__riscv_vfadd_vv_f32m1(a, b, 4)); } // Add the lower single-precision (32-bit) floating-point element in a and b, @@ -486,7 +488,11 @@ FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { // Compute the bitwise AND of packed single-precision (32-bit) floating-point // elements in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps -// FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) {} +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t _b = vreinterpretq_m128_i32(b); + return vreinterpretq_i32_m128(__riscv_vand_vv_i32m1(_a, _b, 4)); +} // Compute the bitwise NOT of packed single-precision (32-bit) floating-point // elements in a and then AND with b, and store the results in dst. @@ -1456,12 +1462,20 @@ FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) { // Compute the bitwise AND of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd -// FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {} +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { + vint64m1_t _a = vreinterpretq_m128d_i64(a); + vint64m1_t _b = vreinterpretq_m128d_i64(b); + return vreinterpretq_i64_m128d(__riscv_vand_vv_i64m1(_a, _b, 2)); +} // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 -// FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {} +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i(__riscv_vand_vv_i32m1(_a, _b, 4)); +} // Compute the bitwise NOT of packed double-precision (64-bit) floating-point // elements in a and then AND with b, and store the results in dst. diff --git a/tests/impl.cpp b/tests/impl.cpp index 757d375..e86f3c8 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -374,11 +374,11 @@ result_t test_mm_shuffle_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); // This function is not called from "run_single_test", but for other intrinsic // tests that might need to call "_mm_set_epi32". -// __m128i do_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w) { -// __m128i a = _mm_set_epi32(x, y, z, w); -// validate_int32(a, w, z, y, x); -// return a; -// } +__m128i do_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w) { + __m128i a = _mm_set_epi32(x, y, z, w); + validate_int32(a, w, z, y, x); + return a; +} // This function is not called from "run_single_test", but for other intrinsic // tests that might need to load __m64 data. @@ -780,29 +780,29 @@ result_t test_mm_add_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { } result_t test_mm_and_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const float *_a = impl.test_cases_float_pointer1; - // const float *_b = impl.test_cases_float_pointer2; - // __m128 a = load_m128(_a); - // __m128 b = load_m128(_b); - // __m128 c = _mm_and_ps(a, b); +#ifdef ENABLE_TEST_ALL + const float *_a = impl.test_cases_float_pointer1; + const float *_b = impl.test_cases_float_pointer2; + __m128 a = load_m128(_a); + __m128 b = load_m128(_b); + __m128 c = _mm_and_ps(a, b); // now for the assertion... - // const uint32_t *ia = (const uint32_t *)&a; - // const uint32_t *ib = (const uint32_t *)&b; - // uint32_t r[4]; - // r[0] = ia[0] & ib[0]; - // r[1] = ia[1] & ib[1]; - // r[2] = ia[2] & ib[2]; - // r[3] = ia[3] & ib[3]; - // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); - // result_t res = VALIDATE_INT32_M128(*(const __m128i *)&c, r); - // if (res) { - // res = VALIDATE_INT32_M128(ret, r); - // } - // return res; - // #else + const uint32_t *ia = (const uint32_t *)&a; + const uint32_t *ib = (const uint32_t *)&b; + uint32_t r[4]; + r[0] = ia[0] & ib[0]; + r[1] = ia[1] & ib[1]; + r[2] = ia[2] & ib[2]; + r[3] = ia[3] & ib[3]; + __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + result_t res = VALIDATE_INT32_M128(*(const __m128i *)&c, r); + if (res) { + res = VALIDATE_INT32_M128(ret, r); + } + return res; +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } // r0 := ~a0 & b0 @@ -3971,48 +3971,48 @@ result_t test_mm_adds_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { } result_t test_mm_and_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1; - // const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2; - // - // int64_t d0 = _a[0] & _b[0]; - // int64_t d1 = _a[1] & _b[1]; - // - // __m128d a = load_m128d(_a); - // __m128d b = load_m128d(_b); - // __m128d c = _mm_and_pd(a, b); - // - // return validate_double(c, *((double *)&d0), *((double *)&d1)); - // #else +#ifdef ENABLE_TEST_ALL + const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1; + const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2; + + int64_t d0 = _a[0] & _b[0]; + int64_t d1 = _a[1] & _b[1]; + + __m128d a = load_m128d(_a); + __m128d b = load_m128d(_b); + __m128d c = _mm_and_pd(a, b); + + return validate_double(c, *((double *)&d0), *((double *)&d1)); +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_and_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { - // #ifdef ENABLE_TEST_ALL - // const int32_t *_a = impl.test_cases_int_pointer1; - // const int32_t *_b = impl.test_cases_int_pointer2; - // __m128i a = load_m128i(_a); - // __m128i b = load_m128i(_b); - // __m128 fc = _mm_and_ps(*(const __m128 *)&a, *(const __m128 *)&b); - // __m128i c = *(const __m128i *)&fc; +#ifdef ENABLE_TEST_ALL + const int32_t *_a = impl.test_cases_int_pointer1; + const int32_t *_b = impl.test_cases_int_pointer2; + __m128i a = load_m128i(_a); + __m128i b = load_m128i(_b); + __m128 fc = _mm_and_ps(*(const __m128 *)&a, *(const __m128 *)&b); + __m128i c = *(const __m128i *)&fc; // now for the assertion... - // const uint32_t *ia = (const uint32_t *)&a; - // const uint32_t *ib = (const uint32_t *)&b; - // uint32_t r[4]; - // r[0] = ia[0] & ib[0]; - // r[1] = ia[1] & ib[1]; - // r[2] = ia[2] & ib[2]; - // r[3] = ia[3] & ib[3]; - // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); - // result_t res = VALIDATE_INT32_M128(c, r); - // if (res) { - // res = VALIDATE_INT32_M128(ret, r); - // } - // return res; - // #else + const uint32_t *ia = (const uint32_t *)&a; + const uint32_t *ib = (const uint32_t *)&b; + uint32_t r[4]; + r[0] = ia[0] & ib[0]; + r[1] = ia[1] & ib[1]; + r[2] = ia[2] & ib[2]; + r[3] = ia[3] & ib[3]; + __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + result_t res = VALIDATE_INT32_M128(c, r); + if (res) { + res = VALIDATE_INT32_M128(ret, r); + } + return res; +#else return TEST_UNIMPL; - // #endif // ENABLE_TEST_ALL +#endif // ENABLE_TEST_ALL } result_t test_mm_andnot_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {