Skip to content

Commit

Permalink
Merge pull request #47 from howjmay/move
Browse files Browse the repository at this point in the history
feat: Add mm_move*
  • Loading branch information
howjmay authored Jan 19, 2024
2 parents 169677b + b4796cb commit 9a8610f
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 106 deletions.
64 changes: 52 additions & 12 deletions sse2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
__riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_f32m1_u32m1(x))
#define vreinterpretq_m128_u32(x) __riscv_vreinterpret_v_f32m1_u32m1(x)
#define vreinterpretq_m128_u64(x) \
__riscv_vreinterpret_v_f64m1_u64m1(__riscv_vreinterpret_v_f32m1_f64m1(x))
__riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(x))
#define vreinterpretq_m128_i8(x) \
__riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_f32m1_i32m1(x))
#define vreinterpretq_m128_i16(x) \
Expand All @@ -110,15 +110,17 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
#define vreinterpretq_m128_i64(x) \
__riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(x))
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) __riscv_vreinterpret_v_f32m1_f64m1(x)
#define vreinterpretq_m128_f64(x) \
__riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1( \
__riscv_vreinterpret_v_f32m1_u32m1(x)))

#define vreinterpretq_u8_m128(x) \
__riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u8m1_u32m1(x))
#define vreinterpretq_u16_m128(x) \
__riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u16m1_u32m1(x))
#define vreinterpretq_u32_m128(x) __riscv_vreinterpret_v_u32m1_f32m1(x)
#define vreinterpretq_u64_m128(x) \
__riscv_vreinterpret_v_f64m1_f32m1(__riscv_vreinterpret_v_u64m1_f64m1(x))
__riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(x))
#define vreinterpretq_i8_m128(x) \
__riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i8m1_i32m1(x))
#define vreinterpretq_i16_m128(x) \
Expand All @@ -127,7 +129,9 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
#define vreinterpretq_i64_m128(x) \
__riscv_vreinterpret_v_f64m1_f32m1(__riscv_vreinterpret_v_i64m1_f64m1(x))
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) __riscv_vreinterpret_v_f64m1_f32m1(x)
#define vreinterpretq_f64_m128(x) \
__riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1( \
__riscv_vreinterpret_v_f64m1_u64m1(x)))

#define vreinterpretq_m128d_u8(x) \
__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_f64m1_u64m1(x))
Expand Down Expand Up @@ -1948,21 +1952,57 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) {
__riscv_vslideup_vx_u16m1(zeros, min_index, 0, 2));
}

// Move the lower 64-bit integer of a into the low lane of the result and
// zero the upper lane (SSE2 _mm_move_epi64).
FORCE_INLINE __m128i _mm_move_epi64(__m128i a) {
  vuint64m1_t _a = vreinterpretq_m128i_u64(a);
  vuint64m1_t zeros = __riscv_vmv_v_x_u64m1(0, 2);
  // vslideup with offset 0 and vl 1 writes only element 0 from _a; element 1
  // is in the tail of `zeros`.
  // NOTE(review): this relies on the tail element staying 0 (tail-undisturbed
  // behavior) — confirm the intrinsic's tail policy on the target toolchain.
  return vreinterpretq_u64_m128i(__riscv_vslideup_vx_u64m1(zeros, _a, 0, 1));
}

// Move the lower double of b into the lower lane of a, keeping a's upper
// lane (SSE2 _mm_move_sd): result = { b[0], a[1] }.
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) {
  vfloat64m1_t _a = vreinterpretq_m128d_f64(a);
  vfloat64m1_t _b = vreinterpretq_m128d_f64(b);
  // offset 0, vl 1: only element 0 is written from _b; element 1 of _a is in
  // the tail. NOTE(review): relies on tail-undisturbed behavior — confirm.
  return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1(_a, _b, 0, 1));
}

// Move the lowest float of b into the lowest lane of a, keeping a's three
// upper lanes (SSE _mm_move_ss): result = { b[0], a[1], a[2], a[3] }.
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) {
  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
  // offset 0, vl 1: only element 0 is written from _b; elements 1..3 of _a
  // are in the tail. NOTE(review): relies on tail-undisturbed behavior —
  // confirm the intrinsic's tail policy on the target toolchain.
  return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1(_a, _b, 0, 1));
}

// Duplicate the lower double of a into both lanes (SSE3 _mm_movedup_pd):
// result = { a[0], a[0] }.
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) {
  vfloat64m1_t dup = vreinterpretq_m128d_f64(a);
  // Slide element 0 up into element 1 over the full vl of 2; element 0 sits
  // below the offset and is always kept from the destination operand.
  return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1(dup, dup, 1, 2));
}

// Duplicate the odd-indexed floats (SSE3 _mm_movehdup_ps):
// result = { a[1], a[1], a[3], a[3] }. Each 64-bit element holds one
// even/odd float pair, so the pairs can be fixed up with shifts.
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) {
  // TODO optimize with vmacc
  vuint64m1_t pairs = vreinterpretq_m128_u64(a);
  // Bring the odd float of each pair down into the low 32 bits...
  vuint64m1_t odd = __riscv_vsrl_vx_u64m1(pairs, 32, 2);
  // ...and OR in a copy shifted back into the high 32 bits.
  return vreinterpretq_u64_m128(
      __riscv_vor_vv_u64m1(__riscv_vsll_vx_u64m1(odd, 32, 2), odd, 2));
}

// result = { b[2], b[3], a[2], a[3] } (SSE _mm_movehl_ps). The four floats
// are handled as two 64-bit lanes.
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) {
  vfloat64m1_t _a = vreinterpretq_m128_f64(a);
  vfloat64m1_t _b = vreinterpretq_m128_f64(b);
  // b_s[0] = high 64-bit lane of b (floats b[2], b[3]).
  vfloat64m1_t b_s = __riscv_vslidedown_vx_f64m1(_b, 1, 2);
  // Write only lane 0 (vl = 1); lane 1 should keep a's high float pair.
  // NOTE(review): lane 1 is in the tail — relies on tail-undisturbed
  // behavior; confirm the intrinsic's tail policy on the target toolchain.
  return vreinterpretq_f64_m128(__riscv_vslideup_vx_f64m1(_a, b_s, 0, 1));
}

// Duplicate the even-indexed floats (SSE3 _mm_moveldup_ps):
// result = { a[0], a[0], a[2], a[2] }. Each 64-bit element holds one
// even/odd float pair, so the pairs can be fixed up with shifts.
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) {
  // TODO optimize with vmacc
  vuint64m1_t pairs = vreinterpretq_m128_u64(a);
  // Move the even float of each pair into the high 32 bits...
  vuint64m1_t even_hi = __riscv_vsll_vx_u64m1(pairs, 32, 2);
  // ...and OR in a copy shifted back down into the low 32 bits.
  return vreinterpretq_u64_m128(
      __riscv_vor_vv_u64m1(even_hi, __riscv_vsrl_vx_u64m1(even_hi, 32, 2), 2));
}

// result = { a[0], a[1], b[0], b[1] } (SSE _mm_movelh_ps). The four floats
// are handled as two 64-bit lanes.
FORCE_INLINE __m128 _mm_movelh_ps(__m128 a, __m128 b) {
  vfloat64m1_t lo_pairs = vreinterpretq_m128_f64(a);
  vfloat64m1_t hi_pairs = vreinterpretq_m128_f64(b);
  // Lane 1 receives b's low pair; lane 0 sits below the offset and is always
  // kept from the destination operand (a's low pair).
  return vreinterpretq_f64_m128(
      __riscv_vslideup_vx_f64m1(lo_pairs, hi_pairs, 1, 2));
}

// FORCE_INLINE int _mm_movemask_epi8 (__m128i a) {}

Expand Down
187 changes: 93 additions & 94 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2496,63 +2496,63 @@ result_t test_mm_min_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

// Verify _mm_move_ss: lane 0 comes from b, lanes 1..3 come from a.
result_t test_mm_move_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  const float *_b = impl.test_cases_float_pointer2;
  __m128 a = load_m128(_a);
  __m128 b = load_m128(_b);

  // Expected result computed scalar-wise.
  float _c[4];
  _c[0] = _b[0];
  _c[1] = _a[1];
  _c[2] = _a[2];
  _c[3] = _a[3];

  __m128 c = _mm_move_ss(a, b);
  return validate_float(c, _c[0], _c[1], _c[2], _c[3]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_movehl_ps: result = { b[2], b[3], a[2], a[3] }.
result_t test_mm_movehl_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  const float *_b = impl.test_cases_float_pointer2;

  // Expected result computed scalar-wise.
  float f0 = _b[2];
  float f1 = _b[3];
  float f2 = _a[2];
  float f3 = _a[3];

  __m128 a = load_m128(_a);
  __m128 b = load_m128(_b);
  __m128 ret = _mm_movehl_ps(a, b);

  return validate_float(ret, f0, f1, f2, f3);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_movelh_ps: result = { a[0], a[1], b[0], b[1] }.
result_t test_mm_movelh_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  const float *_b = impl.test_cases_float_pointer2;

  // Expected result computed scalar-wise.
  float f0 = _a[0];
  float f1 = _a[1];
  float f2 = _b[0];
  float f3 = _b[1];

  __m128 a = load_m128(_a);
  __m128 b = load_m128(_b);
  __m128 ret = _mm_movelh_ps(a, b);

  return validate_float(ret, f0, f1, f2, f3);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_mm_movemask_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
Expand Down Expand Up @@ -6002,37 +6002,36 @@ result_t test_mm_min_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

// Verify _mm_move_epi64: low 64-bit lane copied, high lane zeroed.
result_t test_mm_move_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1;
  // Expected result computed scalar-wise.
  int64_t d0 = _a[0];
  int64_t d1 = 0;

  __m128i a = load_m128i(_a);
  __m128i c = _mm_move_epi64(a);

  return validate_int64(c, d0, d1);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_move_sd: lane 0 comes from b, lane 1 comes from a.
result_t test_mm_move_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const double *_a = (const double *)impl.test_cases_float_pointer1;
  const double *_b = (const double *)impl.test_cases_float_pointer2;
  __m128d a = load_m128d(_a);
  __m128d b = load_m128d(_b);

  // Expected result computed scalar-wise.
  double _c[2];
  _c[0] = _b[0];
  _c[1] = _a[1];

  __m128d c = _mm_move_sd(a, b);
  return validate_double(c, _c[0], _c[1]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_mm_movemask_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
Expand Down Expand Up @@ -8322,35 +8321,35 @@ result_t test_mm_loaddup_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

// Verify _mm_movedup_pd: lower double duplicated into both lanes.
result_t test_mm_movedup_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const double *_a = (const double *)impl.test_cases_float_pointer1;
  __m128d a = load_m128d(_a);
  __m128d c = _mm_movedup_pd(a);

  return validate_double(c, _a[0], _a[0]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_movehdup_ps: odd-indexed floats duplicated downward,
// result = { a[1], a[1], a[3], a[3] }.
result_t test_mm_movehdup_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  __m128 a = load_m128(_a);
  return validate_float(_mm_movehdup_ps(a), _a[1], _a[1], _a[3], _a[3]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_moveldup_ps: even-indexed floats duplicated upward,
// result = { a[0], a[0], a[2], a[2] }.
result_t test_mm_moveldup_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  __m128 a = load_m128(_a);
  return validate_float(_mm_moveldup_ps(a), _a[0], _a[0], _a[2], _a[2]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

/* SSSE3 */
Expand Down

0 comments on commit 9a8610f

Please sign in to comment.