diff --git a/sse2rvv.h b/sse2rvv.h
index 285cec5..51519a4 100644
--- a/sse2rvv.h
+++ b/sse2rvv.h
@@ -2004,13 +2004,33 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 a, __m128 b) {
   return vreinterpretq_f64_m128(__riscv_vslideup_vx_f64m1(_a, _b, 1, 2));
 }
 
-// FORCE_INLINE int _mm_movemask_epi8 (__m128i a) {}
+FORCE_INLINE int _mm_movemask_epi8(__m128i a) {
+  vint8m1_t _a = vreinterpretq_m128i_i8(a);
+  vuint16m1_t nonzeros =
+      __riscv_vreinterpret_v_b8_u16m1(__riscv_vmslt_vx_i8m1_b8(_a, 0, 16));
+  return (int)__riscv_vmv_x_s_u16m1_u16(nonzeros);
+}
 
-// FORCE_INLINE int _mm_movemask_pd (__m128d a) {}
+FORCE_INLINE int _mm_movemask_pd(__m128d a) {
+  vint64m1_t _a = vreinterpretq_m128d_i64(a);
+  vuint8m1_t nonzeros =
+      __riscv_vreinterpret_v_b64_u8m1(__riscv_vmslt_vx_i64m1_b64(_a, 0, 2));
+  return (int)(__riscv_vmv_x_s_u8m1_u8(nonzeros) & 0x3);
+}
 
-// FORCE_INLINE int _mm_movemask_pi8 (__m64 a) {}
+FORCE_INLINE int _mm_movemask_pi8(__m64 a) {
+  vint8m1_t _a = vreinterpretq_m128i_i8(a);
+  vuint8m1_t nonzeros =
+      __riscv_vreinterpret_v_b8_u8m1(__riscv_vmslt_vx_i8m1_b8(_a, 0, 8));
+  return (int)__riscv_vmv_x_s_u8m1_u8(nonzeros);
+}
 
-// FORCE_INLINE int _mm_movemask_ps (__m128 a) {}
+FORCE_INLINE int _mm_movemask_ps(__m128 a) {
+  vint32m1_t _a = vreinterpretq_m128_i32(a);
+  vuint8m1_t nonzeros =
+      __riscv_vreinterpret_v_b32_u8m1(__riscv_vmslt_vx_i32m1_b32(_a, 0, 4));
+  return (int)(__riscv_vmv_x_s_u8m1_u8(nonzeros) & 0xf);
+}
 
 // FORCE_INLINE __m64 _mm_movepi64_pi64 (__m128i a) {}
 
@@ -2081,7 +2101,7 @@ FORCE_INLINE __m64 _m_pminsw(__m64 a, __m64 b) { return _mm_min_pi16(a, b); }
 
 FORCE_INLINE __m64 _m_pminub(__m64 a, __m64 b) { return _mm_min_pu8(a, b); }
 
-// FORCE_INLINE int _m_pmovmskb (__m64 a) {}
+FORCE_INLINE int _m_pmovmskb(__m64 a) { return _mm_movemask_pi8(a); }
 
 // FORCE_INLINE __m64 _m_pmulhuw (__m64 a, __m64 b) {}
 
diff --git a/tests/impl.cpp b/tests/impl.cpp
index fd7a86e..22bf414 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -2556,49 +2556,49 @@ result_t test_mm_movelh_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_movemask_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
-  //   unsigned int _c = 0;
-  //   for (int i = 0; i < 8; i++) {
-  //     if (_a[i] & 0x80) {
-  //       _c |= (1 << i);
-  //     }
-  //   }
-  //
-  //   const __m64 *a = (const __m64 *)_a;
-  //   int c = _mm_movemask_pi8(*a);
-  //
-  //   ASSERT_RETURN((unsigned int)c == _c);
-  //   return TEST_SUCCESS;
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
+  unsigned int _c = 0;
+  for (int i = 0; i < 8; i++) {
+    if (_a[i] & 0x80) {
+      _c |= (1 << i);
+    }
+  }
+
+  const __m64 *a = (const __m64 *)_a;
+  int c = _mm_movemask_pi8(*a);
+
+  ASSERT_RETURN((unsigned int)c == _c);
+  return TEST_SUCCESS;
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_movemask_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *p = impl.test_cases_float_pointer1;
-  //   int ret = 0;
-  //
-  //   const uint32_t *ip = (const uint32_t *)p;
-  //   if (ip[0] & 0x80000000) {
-  //     ret |= 1;
-  //   }
-  //   if (ip[1] & 0x80000000) {
-  //     ret |= 2;
-  //   }
-  //   if (ip[2] & 0x80000000) {
-  //     ret |= 4;
-  //   }
-  //   if (ip[3] & 0x80000000) {
-  //     ret |= 8;
-  //   }
-  //   __m128 a = load_m128(p);
-  //   int val = _mm_movemask_ps(a);
-  //   return val == ret ? TEST_SUCCESS : TEST_FAIL;
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *p = impl.test_cases_float_pointer1;
+  int ret = 0;
+
+  const uint32_t *ip = (const uint32_t *)p;
+  if (ip[0] & 0x80000000) {
+    ret |= 1;
+  }
+  if (ip[1] & 0x80000000) {
+    ret |= 2;
+  }
+  if (ip[2] & 0x80000000) {
+    ret |= 4;
+  }
+  if (ip[3] & 0x80000000) {
+    ret |= 8;
+  }
+  __m128 a = load_m128(p);
+  int val = _mm_movemask_ps(a);
+  return val == ret ? TEST_SUCCESS : TEST_FAIL;
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_mul_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
@@ -2825,7 +2825,19 @@ result_t test_m_pminub(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 
 result_t test_m_pmovmskb(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   return test_mm_movemask_pi8(impl, iter);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
+  unsigned int _c = 0;
+  for (int i = 0; i < 8; i++) {
+    if (_a[i] & 0x80) {
+      _c |= (1 << i);
+    }
+  }
+
+  const __m64 *a = (const __m64 *)_a;
+  int c = _m_pmovmskb(*a);
+
+  ASSERT_RETURN((unsigned int)c == _c);
+  return TEST_SUCCESS;
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
@@ -6035,42 +6047,41 @@ result_t test_mm_move_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_movemask_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const int32_t *_a = impl.test_cases_int_pointer1;
-  //   __m128i a = load_m128i(_a);
-  //
-  //   const uint8_t *ip = (const uint8_t *)_a;
-  //   int ret = 0;
-  //   uint32_t mask = 1;
-  //   for (uint32_t i = 0; i < 16; i++) {
-  //     if (ip[i] & 0x80) {
-  //       ret |= mask;
-  //     }
-  //     mask = mask << 1;
-  //   }
-  //   int test = _mm_movemask_epi8(a);
-  //   ASSERT_RETURN(test == ret);
-  //   return TEST_SUCCESS;
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int32_t *_a = impl.test_cases_int_pointer1;
+  __m128i a = load_m128i(_a);
+
+  const uint8_t *_a_u8 = (const uint8_t *)_a;
+  int _c = 0;
+  uint32_t mask = 1;
+  for (uint32_t i = 0; i < 16; i++) {
+    if (_a_u8[i] & 0x80) {
+      _c |= mask;
+    }
+    mask = mask << 1;
+  }
+  int c = _mm_movemask_epi8(a);
+  ASSERT_RETURN(c == _c);
+  return TEST_SUCCESS;
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_movemask_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *_a = (const double *)impl.test_cases_float_pointer1;
-  //   unsigned int _c = 0;
-  //   _c |= ((*(const uint64_t *)_a) >> 63) & 0x1;
-  //   _c |= (((*(const uint64_t *)(_a + 1)) >> 62) & 0x2);
-  //
-  //   __m128d a = load_m128d(_a);
-  //   int c = _mm_movemask_pd(a);
-  //
-  //   ASSERT_RETURN((unsigned int)c == _c);
-  //   return TEST_SUCCESS;
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;
+  unsigned int _c = 0;
+  _c |= ((*(const uint64_t *)_a) >> 63) & 0x1;
+  _c |= (((*(const uint64_t *)(_a + 1)) >> 62) & 0x2);
+
+  __m128d a = load_m128d(_a);
+  int c = _mm_movemask_pd(a);
+  ASSERT_RETURN((unsigned int)c == _c);
+  return TEST_SUCCESS;
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_movepi64_pi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {