From 650cfca557a5e7642da624019f88359ac1d86b16 Mon Sep 17 00:00:00 2001
From: Yang Hau <yuanyanghau@gmail.com>
Date: Sun, 24 Dec 2023 11:41:04 +0800
Subject: [PATCH] feat: Add _mm_and_*

---
 sse2rvv.h      |  24 ++++++++--
 tests/impl.cpp | 124 ++++++++++++++++++++++++-------------------------
 2 files changed, 81 insertions(+), 67 deletions(-)

diff --git a/sse2rvv.h b/sse2rvv.h
index 0631b7a..d9945a0 100644
--- a/sse2rvv.h
+++ b/sse2rvv.h
@@ -97,7 +97,7 @@ typedef vint32m1_t __m128i;   /* 128-bit vector containing integers */
   __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_f32m1_i32m1(x))
 #define vreinterpretq_m128_i32(x) __riscv_vreinterpret_v_f32m1_i32m1(x)
 #define vreinterpretq_m128_i64(x)                                              \
-  __riscv_vreinterpret_v_f64m1_i64m1(__riscv_vreinterpret_v_f32m1_f64m1(x))
+  __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(x))
 #define vreinterpretq_m128_f32(x) (x)
 #define vreinterpretq_m128_f64(x) __riscv_vreinterpret_v_f32m1_f64m1(x)
 
@@ -474,7 +474,9 @@ typedef struct {
 // store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {
-  return __riscv_vfadd_vv_f32m1(a, b, 4);
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+  return vreinterpretq_f32_m128(__riscv_vfadd_vv_f32m1(a, b, 4));
 }
 
 // Add the lower single-precision (32-bit) floating-point element in a and b,
@@ -486,7 +488,11 @@ FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {
 // Compute the bitwise AND of packed single-precision (32-bit) floating-point
 // elements in a and b, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
-// FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) {}
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) {
+  vint32m1_t _a = vreinterpretq_m128_i32(a);
+  vint32m1_t _b = vreinterpretq_m128_i32(b);
+  return vreinterpretq_i32_m128(__riscv_vand_vv_i32m1(_a, _b, 4));
+}
 
 // Compute the bitwise NOT of packed single-precision (32-bit) floating-point
 // elements in a and then AND with b, and store the results in dst.
@@ -1456,12 +1462,20 @@ FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) {
 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
 // elements in a and b, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
-// FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {}
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
+  vint64m1_t _a = vreinterpretq_m128d_i64(a);
+  vint64m1_t _b = vreinterpretq_m128d_i64(b);
+  return vreinterpretq_i64_m128d(__riscv_vand_vv_i64m1(_a, _b, 2));
+}
 
 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
 // and store the result in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
-// FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {
+  vint32m1_t _a = vreinterpretq_m128i_i32(a);
+  vint32m1_t _b = vreinterpretq_m128i_i32(b);
+  return vreinterpretq_i32_m128i(__riscv_vand_vv_i32m1(_a, _b, 4));
+}
 
 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
 // elements in a and then AND with b, and store the results in dst.
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 757d375..e86f3c8 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -374,11 +374,11 @@ result_t test_mm_shuffle_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter);
 
 // This function is not called from "run_single_test", but for other intrinsic
 // tests that might need to call "_mm_set_epi32".
-// __m128i do_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w) {
-//   __m128i a = _mm_set_epi32(x, y, z, w);
-//   validate_int32(a, w, z, y, x);
-//   return a;
-// }
+__m128i do_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w) {
+  __m128i a = _mm_set_epi32(x, y, z, w);
+  validate_int32(a, w, z, y, x);
+  return a;
+}
 
 // This function is not called from "run_single_test", but for other intrinsic
 // tests that might need to load __m64 data.
@@ -780,29 +780,29 @@ result_t test_mm_add_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_and_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *_a = impl.test_cases_float_pointer1;
-  //   const float *_b = impl.test_cases_float_pointer2;
-  //   __m128 a = load_m128(_a);
-  //   __m128 b = load_m128(_b);
-  //   __m128 c = _mm_and_ps(a, b);
+#ifdef ENABLE_TEST_ALL
+  const float *_a = impl.test_cases_float_pointer1;
+  const float *_b = impl.test_cases_float_pointer2;
+  __m128 a = load_m128(_a);
+  __m128 b = load_m128(_b);
+  __m128 c = _mm_and_ps(a, b);
   // now for the assertion...
-  //   const uint32_t *ia = (const uint32_t *)&a;
-  //   const uint32_t *ib = (const uint32_t *)&b;
-  //   uint32_t r[4];
-  //   r[0] = ia[0] & ib[0];
-  //   r[1] = ia[1] & ib[1];
-  //   r[2] = ia[2] & ib[2];
-  //   r[3] = ia[3] & ib[3];
-  //   __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
-  //   result_t res = VALIDATE_INT32_M128(*(const __m128i *)&c, r);
-  //   if (res) {
-  //     res = VALIDATE_INT32_M128(ret, r);
-  //   }
-  //   return res;
-  // #else
+  const uint32_t *ia = (const uint32_t *)&a;
+  const uint32_t *ib = (const uint32_t *)&b;
+  uint32_t r[4];
+  r[0] = ia[0] & ib[0];
+  r[1] = ia[1] & ib[1];
+  r[2] = ia[2] & ib[2];
+  r[3] = ia[3] & ib[3];
+  __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
+  result_t res = VALIDATE_INT32_M128(*(const __m128i *)&c, r);
+  if (res) {
+    res = VALIDATE_INT32_M128(ret, r);
+  }
+  return res;
+#else
   return TEST_UNIMPL;
-  // #endif  // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 // r0 := ~a0 & b0
@@ -3971,48 +3971,48 @@ result_t test_mm_adds_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_and_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1;
-  //   const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2;
-  //
-  //   int64_t d0 = _a[0] & _b[0];
-  //   int64_t d1 = _a[1] & _b[1];
-  //
-  //   __m128d a = load_m128d(_a);
-  //   __m128d b = load_m128d(_b);
-  //   __m128d c = _mm_and_pd(a, b);
-  //
-  //   return validate_double(c, *((double *)&d0), *((double *)&d1));
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1;
+  const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2;
+
+  int64_t d0 = _a[0] & _b[0];
+  int64_t d1 = _a[1] & _b[1];
+
+  __m128d a = load_m128d(_a);
+  __m128d b = load_m128d(_b);
+  __m128d c = _mm_and_pd(a, b);
+
+  return validate_double(c, *((double *)&d0), *((double *)&d1));
+#else
   return TEST_UNIMPL;
-  // #endif  // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_and_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const int32_t *_a = impl.test_cases_int_pointer1;
-  //   const int32_t *_b = impl.test_cases_int_pointer2;
-  //   __m128i a = load_m128i(_a);
-  //   __m128i b = load_m128i(_b);
-  //   __m128 fc = _mm_and_ps(*(const __m128 *)&a, *(const __m128 *)&b);
-  //   __m128i c = *(const __m128i *)&fc;
+#ifdef ENABLE_TEST_ALL
+  const int32_t *_a = impl.test_cases_int_pointer1;
+  const int32_t *_b = impl.test_cases_int_pointer2;
+  __m128i a = load_m128i(_a);
+  __m128i b = load_m128i(_b);
+  __m128 fc = _mm_and_ps(*(const __m128 *)&a, *(const __m128 *)&b);
+  __m128i c = *(const __m128i *)&fc;
   // now for the assertion...
-  //   const uint32_t *ia = (const uint32_t *)&a;
-  //   const uint32_t *ib = (const uint32_t *)&b;
-  //   uint32_t r[4];
-  //   r[0] = ia[0] & ib[0];
-  //   r[1] = ia[1] & ib[1];
-  //   r[2] = ia[2] & ib[2];
-  //   r[3] = ia[3] & ib[3];
-  //   __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
-  //   result_t res = VALIDATE_INT32_M128(c, r);
-  //   if (res) {
-  //     res = VALIDATE_INT32_M128(ret, r);
-  //   }
-  //   return res;
-  // #else
+  const uint32_t *ia = (const uint32_t *)&a;
+  const uint32_t *ib = (const uint32_t *)&b;
+  uint32_t r[4];
+  r[0] = ia[0] & ib[0];
+  r[1] = ia[1] & ib[1];
+  r[2] = ia[2] & ib[2];
+  r[3] = ia[3] & ib[3];
+  __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
+  result_t res = VALIDATE_INT32_M128(c, r);
+  if (res) {
+    res = VALIDATE_INT32_M128(ret, r);
+  }
+  return res;
+#else
   return TEST_UNIMPL;
-  // #endif  // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_andnot_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {