From e8dc58085b58b4e34a89cae7f667194a82bb5ddb Mon Sep 17 00:00:00 2001
From: Yang Hau
Date: Tue, 30 Jul 2024 23:26:16 +0800
Subject: [PATCH] feat: Add vext[q]_f64

---
 neon2rvv.h     | 10 ++++++--
 tests/impl.cpp | 64 ++++++++++++++++++++++++++++++++++++++++++++++++--
 tests/impl.h   |  4 ++--
 3 files changed, 72 insertions(+), 6 deletions(-)

diff --git a/neon2rvv.h b/neon2rvv.h
index 85866033..8374ab53 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -11574,9 +11574,15 @@ FORCE_INLINE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, const int c) {
   return __riscv_vslideup_vx_f32m1(a_slidedown, b, 4 - c, 4);
 }
 
-// FORCE_INLINE float64x1_t vext_f64(float64x1_t a, float64x1_t b, const int n);
+FORCE_INLINE float64x1_t vext_f64(float64x1_t a, float64x1_t b, const int c) {
+  vfloat64m1_t a_slidedown = __riscv_vslidedown_vx_f64m1(a, c, 1);
+  return __riscv_vslideup_vx_f64m1(a_slidedown, b, 1 - c, 1);
+}
 
-// FORCE_INLINE float64x2_t vextq_f64(float64x2_t a, float64x2_t b, const int n);
+FORCE_INLINE float64x2_t vextq_f64(float64x2_t a, float64x2_t b, const int c) {
+  vfloat64m1_t a_slidedown = __riscv_vslidedown_vx_f64m1(a, c, 2);
+  return __riscv_vslideup_vx_f64m1(a_slidedown, b, 2 - c, 2);
+}
 
 // FORCE_INLINE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, const int n);
 
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 71472466..26580e9f 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -39230,9 +39230,69 @@ result_t test_vextq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #endif  // ENABLE_TEST_ALL
 }
 
-result_t test_vext_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vext_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;
+  const double *_b = (const double *)impl.test_cases_float_pointer2;
+  const int elt_num = 1;
+  double _c[elt_num];
+  float64x1_t a, b, c;
+
+  double temp_arr[elt_num * 2];
+  for (int i = 0; i < elt_num; i++) {
+    temp_arr[i] = _a[i];
+    temp_arr[i + elt_num] = _b[i];
+  }
+
+#define TEST_IMPL(IDX)                  \
+  for (int i = 0; i < elt_num; i++) {   \
+    _c[i] = temp_arr[i + IDX];          \
+  }                                     \
+  a = vld1_f64(_a);                     \
+  b = vld1_f64(_b);                     \
+  c = vext_f64(a, b, IDX);              \
+  CHECK_RESULT(validate_double(c, _c[0]))
+
+  IMM_1_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
-result_t test_vextq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vextq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;
+  const double *_b = (const double *)impl.test_cases_float_pointer2;
+  const int elt_num = 2;
+  double _c[elt_num];
+  float64x2_t a, b, c;
+
+  double temp_arr[elt_num * 2];
+  for (int i = 0; i < elt_num; i++) {
+    temp_arr[i] = _a[i];
+    temp_arr[i + elt_num] = _b[i];
+  }
+
+#define TEST_IMPL(IDX)                  \
+  for (int i = 0; i < elt_num; i++) {   \
+    _c[i] = temp_arr[i + IDX];          \
+  }                                     \
+  a = vld1q_f64(_a);                    \
+  b = vld1q_f64(_b);                    \
+  c = vextq_f64(a, b, IDX);             \
+  CHECK_RESULT(validate_double(c, _c[0], _c[1]))
+
+  IMM_2_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vext_p8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
 
diff --git a/tests/impl.h b/tests/impl.h
index 671d2aec..667d8b68 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -2269,8 +2269,8 @@
     _(vextq_s32)          \
     _(vextq_s64)          \
     _(vextq_f32)          \
-    /*_(vext_f64) */      \
-    /*_(vextq_f64) */     \
+    _(vext_f64)           \
+    _(vextq_f64)          \
     _(vextq_u8)           \
     _(vextq_u16)          \
     _(vextq_u32)          \