Skip to content

Commit

Permalink
Merge pull request #47 from howjmay/move
Browse files Browse the repository at this point in the history
feat: Add mm_move*
  • Loading branch information
howjmay authored Jan 19, 2024
2 parents 169677b + b4796cb commit 9a8610f
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 106 deletions.
64 changes: 52 additions & 12 deletions sse2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
__riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_f32m1_u32m1(x))
#define vreinterpretq_m128_u32(x) __riscv_vreinterpret_v_f32m1_u32m1(x)
#define vreinterpretq_m128_u64(x) \
__riscv_vreinterpret_v_f64m1_u64m1(__riscv_vreinterpret_v_f32m1_f64m1(x))
__riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(x))
#define vreinterpretq_m128_i8(x) \
__riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_f32m1_i32m1(x))
#define vreinterpretq_m128_i16(x) \
Expand All @@ -110,15 +110,17 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
#define vreinterpretq_m128_i64(x) \
__riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(x))
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) __riscv_vreinterpret_v_f32m1_f64m1(x)
#define vreinterpretq_m128_f64(x) \
__riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1( \
__riscv_vreinterpret_v_f32m1_u32m1(x)))

#define vreinterpretq_u8_m128(x) \
__riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u8m1_u32m1(x))
#define vreinterpretq_u16_m128(x) \
__riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u16m1_u32m1(x))
#define vreinterpretq_u32_m128(x) __riscv_vreinterpret_v_u32m1_f32m1(x)
#define vreinterpretq_u64_m128(x) \
__riscv_vreinterpret_v_f64m1_f32m1(__riscv_vreinterpret_v_u64m1_f64m1(x))
__riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(x))
#define vreinterpretq_i8_m128(x) \
__riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i8m1_i32m1(x))
#define vreinterpretq_i16_m128(x) \
Expand All @@ -127,7 +129,9 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
#define vreinterpretq_i64_m128(x) \
__riscv_vreinterpret_v_f64m1_f32m1(__riscv_vreinterpret_v_i64m1_f64m1(x))
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) __riscv_vreinterpret_v_f64m1_f32m1(x)
#define vreinterpretq_f64_m128(x) \
__riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1( \
__riscv_vreinterpret_v_f64m1_u64m1(x)))

#define vreinterpretq_m128d_u8(x) \
__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_f64m1_u64m1(x))
Expand Down Expand Up @@ -1948,21 +1952,57 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) {
__riscv_vslideup_vx_u16m1(zeros, min_index, 0, 2));
}

// Move the lower 64-bit integer of a into the low lane of the result and
// zero the upper lane (SSE2 _mm_move_epi64).
FORCE_INLINE __m128i _mm_move_epi64(__m128i a) {
  vuint64m1_t _a = vreinterpretq_m128i_u64(a);
  vuint64m1_t zeros = __riscv_vmv_v_x_u64m1(0, 2);
  // vslideup with offset 0 and vl 1 writes only element 0 from _a; element 1
  // is in the tail of `zeros`.
  // NOTE(review): this relies on the tail element staying 0 (tail-undisturbed
  // behavior) — confirm the intrinsic's tail policy on the target toolchain.
  return vreinterpretq_u64_m128i(__riscv_vslideup_vx_u64m1(zeros, _a, 0, 1));
}

// Move the lower double of b into the lower lane of a, keeping a's upper
// lane (SSE2 _mm_move_sd): result = { b[0], a[1] }.
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) {
  vfloat64m1_t _a = vreinterpretq_m128d_f64(a);
  vfloat64m1_t _b = vreinterpretq_m128d_f64(b);
  // offset 0, vl 1: only element 0 is written from _b; element 1 of _a is in
  // the tail. NOTE(review): relies on tail-undisturbed behavior — confirm.
  return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1(_a, _b, 0, 1));
}

// Move the lowest float of b into the lowest lane of a, keeping a's three
// upper lanes (SSE _mm_move_ss): result = { b[0], a[1], a[2], a[3] }.
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) {
  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
  // offset 0, vl 1: only element 0 is written from _b; elements 1..3 of _a
  // are in the tail. NOTE(review): relies on tail-undisturbed behavior —
  // confirm the intrinsic's tail policy on the target toolchain.
  return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1(_a, _b, 0, 1));
}

// Duplicate the lower double of a into both lanes (SSE3 _mm_movedup_pd):
// result = { a[0], a[0] }.
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) {
  vfloat64m1_t dup = vreinterpretq_m128d_f64(a);
  // Slide element 0 up into element 1 over the full vl of 2; element 0 sits
  // below the offset and is always kept from the destination operand.
  return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1(dup, dup, 1, 2));
}

// Duplicate the odd-indexed floats (SSE3 _mm_movehdup_ps):
// result = { a[1], a[1], a[3], a[3] }. Each 64-bit element holds one
// even/odd float pair, so the pairs can be fixed up with shifts.
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) {
  // TODO optimize with vmacc
  vuint64m1_t pairs = vreinterpretq_m128_u64(a);
  // Bring the odd float of each pair down into the low 32 bits...
  vuint64m1_t odd = __riscv_vsrl_vx_u64m1(pairs, 32, 2);
  // ...and OR in a copy shifted back into the high 32 bits.
  return vreinterpretq_u64_m128(
      __riscv_vor_vv_u64m1(__riscv_vsll_vx_u64m1(odd, 32, 2), odd, 2));
}

// result = { b[2], b[3], a[2], a[3] } (SSE _mm_movehl_ps). The four floats
// are handled as two 64-bit lanes.
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) {
  vfloat64m1_t _a = vreinterpretq_m128_f64(a);
  vfloat64m1_t _b = vreinterpretq_m128_f64(b);
  // b_s[0] = high 64-bit lane of b (floats b[2], b[3]).
  vfloat64m1_t b_s = __riscv_vslidedown_vx_f64m1(_b, 1, 2);
  // Write only lane 0 (vl = 1); lane 1 should keep a's high float pair.
  // NOTE(review): lane 1 is in the tail — relies on tail-undisturbed
  // behavior; confirm the intrinsic's tail policy on the target toolchain.
  return vreinterpretq_f64_m128(__riscv_vslideup_vx_f64m1(_a, b_s, 0, 1));
}

// Duplicate the even-indexed floats (SSE3 _mm_moveldup_ps):
// result = { a[0], a[0], a[2], a[2] }. Each 64-bit element holds one
// even/odd float pair, so the pairs can be fixed up with shifts.
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) {
  // TODO optimize with vmacc
  vuint64m1_t pairs = vreinterpretq_m128_u64(a);
  // Move the even float of each pair into the high 32 bits...
  vuint64m1_t even_hi = __riscv_vsll_vx_u64m1(pairs, 32, 2);
  // ...and OR in a copy shifted back down into the low 32 bits.
  return vreinterpretq_u64_m128(
      __riscv_vor_vv_u64m1(even_hi, __riscv_vsrl_vx_u64m1(even_hi, 32, 2), 2));
}

// result = { a[0], a[1], b[0], b[1] } (SSE _mm_movelh_ps). The four floats
// are handled as two 64-bit lanes.
FORCE_INLINE __m128 _mm_movelh_ps(__m128 a, __m128 b) {
  vfloat64m1_t lo_pairs = vreinterpretq_m128_f64(a);
  vfloat64m1_t hi_pairs = vreinterpretq_m128_f64(b);
  // Lane 1 receives b's low pair; lane 0 sits below the offset and is always
  // kept from the destination operand (a's low pair).
  return vreinterpretq_f64_m128(
      __riscv_vslideup_vx_f64m1(lo_pairs, hi_pairs, 1, 2));
}

// FORCE_INLINE int _mm_movemask_epi8 (__m128i a) {}

Expand Down
187 changes: 93 additions & 94 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2496,63 +2496,63 @@ result_t test_mm_min_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

// Verify _mm_move_ss: lane 0 comes from b, lanes 1..3 come from a.
result_t test_mm_move_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  const float *_b = impl.test_cases_float_pointer2;
  __m128 a = load_m128(_a);
  __m128 b = load_m128(_b);

  // Expected result computed scalar-wise.
  float _c[4];
  _c[0] = _b[0];
  _c[1] = _a[1];
  _c[2] = _a[2];
  _c[3] = _a[3];

  __m128 c = _mm_move_ss(a, b);
  return validate_float(c, _c[0], _c[1], _c[2], _c[3]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_movehl_ps: result = { b[2], b[3], a[2], a[3] }.
result_t test_mm_movehl_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  const float *_b = impl.test_cases_float_pointer2;

  // Expected result computed scalar-wise.
  float f0 = _b[2];
  float f1 = _b[3];
  float f2 = _a[2];
  float f3 = _a[3];

  __m128 a = load_m128(_a);
  __m128 b = load_m128(_b);
  __m128 ret = _mm_movehl_ps(a, b);

  return validate_float(ret, f0, f1, f2, f3);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_movelh_ps: result = { a[0], a[1], b[0], b[1] }.
result_t test_mm_movelh_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  const float *_b = impl.test_cases_float_pointer2;

  // Expected result computed scalar-wise.
  float f0 = _a[0];
  float f1 = _a[1];
  float f2 = _b[0];
  float f3 = _b[1];

  __m128 a = load_m128(_a);
  __m128 b = load_m128(_b);
  __m128 ret = _mm_movelh_ps(a, b);

  return validate_float(ret, f0, f1, f2, f3);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_mm_movemask_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
Expand Down Expand Up @@ -6002,37 +6002,36 @@ result_t test_mm_min_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

// Verify _mm_move_epi64: low 64-bit lane copied, high lane zeroed.
result_t test_mm_move_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1;
  // Expected result computed scalar-wise.
  int64_t d0 = _a[0];
  int64_t d1 = 0;

  __m128i a = load_m128i(_a);
  __m128i c = _mm_move_epi64(a);

  return validate_int64(c, d0, d1);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_move_sd: lane 0 comes from b, lane 1 comes from a.
result_t test_mm_move_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const double *_a = (const double *)impl.test_cases_float_pointer1;
  const double *_b = (const double *)impl.test_cases_float_pointer2;
  __m128d a = load_m128d(_a);
  __m128d b = load_m128d(_b);

  // Expected result computed scalar-wise.
  double _c[2];
  _c[0] = _b[0];
  _c[1] = _a[1];

  __m128d c = _mm_move_sd(a, b);
  return validate_double(c, _c[0], _c[1]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

result_t test_mm_movemask_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
Expand Down Expand Up @@ -8322,35 +8321,35 @@ result_t test_mm_loaddup_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

// Verify _mm_movedup_pd: lower double duplicated into both lanes.
result_t test_mm_movedup_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const double *_a = (const double *)impl.test_cases_float_pointer1;
  __m128d a = load_m128d(_a);
  __m128d c = _mm_movedup_pd(a);

  return validate_double(c, _a[0], _a[0]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_movehdup_ps: odd-indexed floats duplicated downward,
// result = { a[1], a[1], a[3], a[3] }.
result_t test_mm_movehdup_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  __m128 a = load_m128(_a);
  return validate_float(_mm_movehdup_ps(a), _a[1], _a[1], _a[3], _a[3]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

// Verify _mm_moveldup_ps: even-indexed floats duplicated upward,
// result = { a[0], a[0], a[2], a[2] }.
result_t test_mm_moveldup_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *_a = impl.test_cases_float_pointer1;
  __m128 a = load_m128(_a);
  return validate_float(_mm_moveldup_ps(a), _a[0], _a[0], _a[2], _a[2]);
#else
  return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
}

/* SSSE3 */
Expand Down

0 comments on commit 9a8610f

Please sign in to comment.