Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add _mm_movemask* #48

Merged
merged 1 commit into from
Jan 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions sse2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -2004,13 +2004,33 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 a, __m128 b) {
return vreinterpretq_f64_m128(__riscv_vslideup_vx_f64m1(_a, _b, 1, 2));
}

// FORCE_INLINE int _mm_movemask_epi8 (__m128i a) {}
FORCE_INLINE int _mm_movemask_epi8(__m128i a) {
vint8m1_t _a = vreinterpretq_m128i_i8(a);
vuint16m1_t nonzeros =
__riscv_vreinterpret_v_b8_u16m1(__riscv_vmslt_vx_i8m1_b8(_a, 0, 16));
return (int)__riscv_vmv_x_s_u16m1_u16(nonzeros);
}

// FORCE_INLINE int _mm_movemask_pd (__m128d a) {}
FORCE_INLINE int _mm_movemask_pd(__m128d a) {
vint64m1_t _a = vreinterpretq_m128d_i64(a);
vuint8m1_t nonzeros =
__riscv_vreinterpret_v_b64_u8m1(__riscv_vmslt_vx_i64m1_b64(_a, 0, 2));
return (int)(__riscv_vmv_x_s_u8m1_u8(nonzeros) & 0x3);
}

// FORCE_INLINE int _mm_movemask_pi8 (__m64 a) {}
FORCE_INLINE int _mm_movemask_pi8(__m64 a) {
vint8m1_t _a = vreinterpretq_m128i_i8(a);
vuint8m1_t nonzeros =
__riscv_vreinterpret_v_b8_u8m1(__riscv_vmslt_vx_i8m1_b8(_a, 0, 8));
return (int)__riscv_vmv_x_s_u8m1_u8(nonzeros);
}

// FORCE_INLINE int _mm_movemask_ps (__m128 a) {}
FORCE_INLINE int _mm_movemask_ps(__m128 a) {
vint32m1_t _a = vreinterpretq_m128_i32(a);
vuint8m1_t nonzeros =
__riscv_vreinterpret_v_b32_u8m1(__riscv_vmslt_vx_i32m1_b32(_a, 0, 4));
return (int)(__riscv_vmv_x_s_u8m1_u8(nonzeros) & 0xf);
}

// FORCE_INLINE __m64 _mm_movepi64_pi64 (__m128i a) {}

Expand Down Expand Up @@ -2081,7 +2101,7 @@ FORCE_INLINE __m64 _m_pminsw(__m64 a, __m64 b) { return _mm_min_pi16(a, b); }

FORCE_INLINE __m64 _m_pminub(__m64 a, __m64 b) { return _mm_min_pu8(a, b); }

// FORCE_INLINE int _m_pmovmskb (__m64 a) {}
FORCE_INLINE int _m_pmovmskb(__m64 a) { return _mm_movemask_pi8(a); }

// FORCE_INLINE __m64 _m_pmulhuw (__m64 a, __m64 b) {}

Expand Down
150 changes: 80 additions & 70 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2556,49 +2556,49 @@ result_t test_mm_movelh_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

result_t test_mm_movemask_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
// #ifdef ENABLE_TEST_ALL
// const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
// unsigned int _c = 0;
// for (int i = 0; i < 8; i++) {
// if (_a[i] & 0x80) {
// _c |= (1 << i);
// }
// }
//
// const __m64 *a = (const __m64 *)_a;
// int c = _mm_movemask_pi8(*a);
//
// ASSERT_RETURN((unsigned int)c == _c);
// return TEST_SUCCESS;
// #else
#ifdef ENABLE_TEST_ALL
const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
unsigned int _c = 0;
for (int i = 0; i < 8; i++) {
if (_a[i] & 0x80) {
_c |= (1 << i);
}
}

const __m64 *a = (const __m64 *)_a;
int c = _mm_movemask_pi8(*a);

ASSERT_RETURN((unsigned int)c == _c);
return TEST_SUCCESS;
#else
return TEST_UNIMPL;
// #endif // ENABLE_TEST_ALL
#endif // ENABLE_TEST_ALL
}

result_t test_mm_movemask_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
// #ifdef ENABLE_TEST_ALL
// const float *p = impl.test_cases_float_pointer1;
// int ret = 0;
//
// const uint32_t *ip = (const uint32_t *)p;
// if (ip[0] & 0x80000000) {
// ret |= 1;
// }
// if (ip[1] & 0x80000000) {
// ret |= 2;
// }
// if (ip[2] & 0x80000000) {
// ret |= 4;
// }
// if (ip[3] & 0x80000000) {
// ret |= 8;
// }
// __m128 a = load_m128(p);
// int val = _mm_movemask_ps(a);
// return val == ret ? TEST_SUCCESS : TEST_FAIL;
// #else
#ifdef ENABLE_TEST_ALL
const float *p = impl.test_cases_float_pointer1;
int ret = 0;

const uint32_t *ip = (const uint32_t *)p;
if (ip[0] & 0x80000000) {
ret |= 1;
}
if (ip[1] & 0x80000000) {
ret |= 2;
}
if (ip[2] & 0x80000000) {
ret |= 4;
}
if (ip[3] & 0x80000000) {
ret |= 8;
}
__m128 a = load_m128(p);
int val = _mm_movemask_ps(a);
return val == ret ? TEST_SUCCESS : TEST_FAIL;
#else
return TEST_UNIMPL;
// #endif // ENABLE_TEST_ALL
#endif // ENABLE_TEST_ALL
}

result_t test_mm_mul_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
Expand Down Expand Up @@ -2825,7 +2825,18 @@ result_t test_m_pminub(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {

result_t test_m_pmovmskb(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
// #ifdef ENABLE_TEST_ALL
// return test_mm_movemask_pi8(impl, iter);
const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
unsigned int _c = 0;
for (int i = 0; i < 8; i++) {
if (_a[i] & 0x80) {
_c |= (1 << i);
}
}

const __m64 *a = (const __m64 *)_a;
int c = _m_pmovmskb(*a);

ASSERT_RETURN((unsigned int)c == _c);
// #else
return TEST_UNIMPL;
// #endif // ENABLE_TEST_ALL
Expand Down Expand Up @@ -6035,42 +6046,41 @@ result_t test_mm_move_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

result_t test_mm_movemask_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
// #ifdef ENABLE_TEST_ALL
// const int32_t *_a = impl.test_cases_int_pointer1;
// __m128i a = load_m128i(_a);
//
// const uint8_t *ip = (const uint8_t *)_a;
// int ret = 0;
// uint32_t mask = 1;
// for (uint32_t i = 0; i < 16; i++) {
// if (ip[i] & 0x80) {
// ret |= mask;
// }
// mask = mask << 1;
// }
// int test = _mm_movemask_epi8(a);
// ASSERT_RETURN(test == ret);
// return TEST_SUCCESS;
// #else
#ifdef ENABLE_TEST_ALL
const int32_t *_a = impl.test_cases_int_pointer1;
__m128i a = load_m128i(_a);

const uint8_t *_a_u8 = (const uint8_t *)_a;
int _c = 0;
uint32_t mask = 1;
for (uint32_t i = 0; i < 16; i++) {
if (_a_u8[i] & 0x80) {
_c |= mask;
}
mask = mask << 1;
}
int c = _mm_movemask_epi8(a);
ASSERT_RETURN(c == _c);
return TEST_SUCCESS;
#else
return TEST_UNIMPL;
// #endif // ENABLE_TEST_ALL
#endif // ENABLE_TEST_ALL
}

result_t test_mm_movemask_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
// #ifdef ENABLE_TEST_ALL
// const double *_a = (const double *)impl.test_cases_float_pointer1;
// unsigned int _c = 0;
// _c |= ((*(const uint64_t *)_a) >> 63) & 0x1;
// _c |= (((*(const uint64_t *)(_a + 1)) >> 62) & 0x2);
//
// __m128d a = load_m128d(_a);
// int c = _mm_movemask_pd(a);
//
// ASSERT_RETURN((unsigned int)c == _c);
// return TEST_SUCCESS;
// #else
#ifdef ENABLE_TEST_ALL
const double *_a = (const double *)impl.test_cases_float_pointer1;
unsigned int _c = 0;
_c |= ((*(const uint64_t *)_a) >> 63) & 0x1;
_c |= (((*(const uint64_t *)(_a + 1)) >> 62) & 0x2);

__m128d a = load_m128d(_a);
int c = _mm_movemask_pd(a);
ASSERT_RETURN((unsigned int)c == _c);
return TEST_SUCCESS;
#else
return TEST_UNIMPL;
// #endif // ENABLE_TEST_ALL
#endif // ENABLE_TEST_ALL
}

result_t test_mm_movepi64_pi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
Expand Down