Merge pull request #19 from howjmay/andnot

feat: Add _mm_andnot_*
pattonkan · Dec 24, 2023 · 92933b4 · 92933b4
2 parents 3fbe37a + 1008158
commit 92933b4
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 65 deletions.
diff --git a/sse2rvv.h b/sse2rvv.h
@@ -476,7 +476,7 @@ typedef struct {
 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {
   vfloat32m1_t _a = vreinterpretq_m128_f32(a);
   vfloat32m1_t _b = vreinterpretq_m128_f32(b);
-  return vreinterpretq_f32_m128(__riscv_vfadd_vv_f32m1(a, b, 4));
+  return vreinterpretq_f32_m128(__riscv_vfadd_vv_f32m1(_a, _b, 4));
 }
 
 // Add the lower single-precision (32-bit) floating-point element in a and b,
@@ -497,7 +497,12 @@ FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) {
 // Compute the bitwise NOT of packed single-precision (32-bit) floating-point
 // elements in a and then AND with b, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
-// FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) {}
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) {
+  vint32m1_t _a = vreinterpretq_m128_i32(a);
+  vint32m1_t _b = vreinterpretq_m128_i32(b);
+  return vreinterpretq_i32_m128(
+      __riscv_vand_vv_i32m1(__riscv_vnot_v_i32m1(_a, 4), _b, 4));
+}
 
 // Average packed unsigned 16-bit integers in a and b, and store the results in
 // dst.
@@ -1480,12 +1485,22 @@ FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {
 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
 // elements in a and then AND with b, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
-// FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) {}
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) {
+  vint64m1_t _a = vreinterpretq_m128d_i64(a);
+  vint64m1_t _b = vreinterpretq_m128d_i64(b);
+  return vreinterpretq_i64_m128d(
+      __riscv_vand_vv_i64m1(__riscv_vnot_v_i64m1(_a, 2), _b, 2));
+}
 
 // Compute the bitwise NOT of 128 bits (representing integer data) in a and then
 // AND with b, and store the result in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
-// FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) {
+  vint32m1_t _a = vreinterpretq_m128i_i32(a);
+  vint32m1_t _b = vreinterpretq_m128i_i32(b);
+  return vreinterpretq_i32_m128i(
+      __riscv_vand_vv_i32m1(__riscv_vnot_v_i32m1(_a, 4), _b, 4));
+}
 
 // Average packed unsigned 16-bit integers in a and b, and store the results in
 // dst.

diff --git a/tests/impl.cpp b/tests/impl.cpp
@@ -810,31 +810,31 @@ result_t test_mm_and_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 // r2 := ~a2 & b2
 // r3 := ~a3 & b3
 result_t test_mm_andnot_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *_a = impl.test_cases_float_pointer1;
-  //   const float *_b = impl.test_cases_float_pointer2;
-  //
-  //   __m128 a = load_m128(_a);
-  //   __m128 b = load_m128(_b);
-  //   __m128 c = _mm_andnot_ps(a, b);
+#ifdef ENABLE_TEST_ALL
+  const float *_a = impl.test_cases_float_pointer1;
+  const float *_b = impl.test_cases_float_pointer2;
+
+  __m128 a = load_m128(_a);
+  __m128 b = load_m128(_b);
+  __m128 c = _mm_andnot_ps(a, b);
   // now for the assertion...
-  //   const uint32_t *ia = (const uint32_t *)&a;
-  //   const uint32_t *ib = (const uint32_t *)&b;
-  //   uint32_t r[4];
-  //   r[0] = ~ia[0] & ib[0];
-  //   r[1] = ~ia[1] & ib[1];
-  //   r[2] = ~ia[2] & ib[2];
-  //   r[3] = ~ia[3] & ib[3];
-  //   __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
-  //   result_t res = TEST_FAIL;
-  //   res = VALIDATE_INT32_M128(*(const __m128i *)&c, r);
-  //   if (res) {
-  //     res = VALIDATE_INT32_M128(ret, r);
-  //   }
-  //   return res;
-  // #else
+  const uint32_t *ia = (const uint32_t *)&a;
+  const uint32_t *ib = (const uint32_t *)&b;
+  uint32_t r[4];
+  r[0] = ~ia[0] & ib[0];
+  r[1] = ~ia[1] & ib[1];
+  r[2] = ~ia[2] & ib[2];
+  r[3] = ~ia[3] & ib[3];
+  __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
+  result_t res = TEST_FAIL;
+  res = VALIDATE_INT32_M128(*(const __m128i *)&c, r);
+  if (res) {
+    res = VALIDATE_INT32_M128(ret, r);
+  }
+  return res;
+#else
   return TEST_UNIMPL;
-  // #endif  // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_avg_pu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
@@ -4016,53 +4016,53 @@ result_t test_mm_and_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_andnot_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *_a = (const double *)impl.test_cases_float_pointer1;
-  //   const double *_b = (const double *)impl.test_cases_float_pointer2;
-  //
-  //   __m128d a = load_m128d(_a);
-  //   __m128d b = load_m128d(_b);
-  //   __m128d c = _mm_andnot_pd(a, b);
-  //
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;
+  const double *_b = (const double *)impl.test_cases_float_pointer2;
+
+  __m128d a = load_m128d(_a);
+  __m128d b = load_m128d(_b);
+  __m128d c = _mm_andnot_pd(a, b);
+
   // Take AND operation a complement of 'a' and 'b'. Bitwise operations are
   // not allowed on float/double datatype, so 'a' and 'b' are calculated in
   // uint64_t datatype.
-  //   const uint64_t *ia = (const uint64_t *)&a;
-  //   const uint64_t *ib = (const uint64_t *)&b;
-  //   uint64_t r0 = ~ia[0] & ib[0];
-  //   uint64_t r1 = ~ia[1] & ib[1];
-  //   return validate_uint64(*(const __m128i *)&c, r0, r1);
-  // #else
+  const uint64_t *ia = (const uint64_t *)&a;
+  const uint64_t *ib = (const uint64_t *)&b;
+  uint64_t r0 = ~ia[0] & ib[0];
+  uint64_t r1 = ~ia[1] & ib[1];
+  return validate_uint64(*(const __m128i *)&c, r0, r1);
+#else
   return TEST_UNIMPL;
-  // #endif  // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_andnot_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const int32_t *_a = impl.test_cases_int_pointer1;
-  //   const int32_t *_b = impl.test_cases_int_pointer2;
-  //   __m128i a = load_m128i(_a);
-  //   __m128i b = load_m128i(_b);
-  //   __m128 fc = _mm_andnot_ps(*(const __m128 *)&a, *(const __m128 *)&b);
-  //   __m128i c = *(const __m128i *)&fc;
+#ifdef ENABLE_TEST_ALL
+  const int32_t *_a = impl.test_cases_int_pointer1;
+  const int32_t *_b = impl.test_cases_int_pointer2;
+  __m128i a = load_m128i(_a);
+  __m128i b = load_m128i(_b);
+  __m128 fc = _mm_andnot_ps(*(const __m128 *)&a, *(const __m128 *)&b);
+  __m128i c = *(const __m128i *)&fc;
   // now for the assertion...
-  //   const uint32_t *ia = (const uint32_t *)&a;
-  //   const uint32_t *ib = (const uint32_t *)&b;
-  //   uint32_t r[4];
-  //   r[0] = ~ia[0] & ib[0];
-  //   r[1] = ~ia[1] & ib[1];
-  //   r[2] = ~ia[2] & ib[2];
-  //   r[3] = ~ia[3] & ib[3];
-  //   __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
-  //   result_t res = TEST_SUCCESS;
-  //   res = VALIDATE_INT32_M128(c, r);
-  //   if (res) {
-  //     res = VALIDATE_INT32_M128(ret, r);
-  //   }
-  //   return res;
-  // #else
+  const uint32_t *ia = (const uint32_t *)&a;
+  const uint32_t *ib = (const uint32_t *)&b;
+  uint32_t r[4];
+  r[0] = ~ia[0] & ib[0];
+  r[1] = ~ia[1] & ib[1];
+  r[2] = ~ia[2] & ib[2];
+  r[3] = ~ia[3] & ib[3];
+  __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
+  result_t res = TEST_SUCCESS;
+  res = VALIDATE_INT32_M128(c, r);
+  if (res) {
+    res = VALIDATE_INT32_M128(ret, r);
+  }
+  return res;
+#else
   return TEST_UNIMPL;
-  // #endif  // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_avg_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {