Merge pull request #486 from howjmay/vrsqrt

feat: Add vrsqrts[q|s|d]_[f32|f64]
howjmay · Jul 30, 2024 · bf34d52 · bf34d52
2 parents 3dda46b + 42687ec
commit bf34d52
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-![coverage badge](https://img.shields.io/badge/coverage-75.4%25-brightgreen)
+![coverage badge](https://img.shields.io/badge/coverage-78.3%25-brightgreen)
 # neon2rvv
 
 A C/C++ header file that converts Arm/Aarch64 NEON intrinsics to RISC-V Vector (RVV) Extension.

diff --git a/neon2rvv.h b/neon2rvv.h
@@ -4816,13 +4816,17 @@ FORCE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
   return __riscv_vfdiv_vf_f32m1(__riscv_vfnmsac_vv_f32m1(vdupq_n_f32(3.0), a, b, 4), 2.0, 4);
 }
 
-// FORCE_INLINE float64x1_t vrsqrts_f64(float64x1_t a, float64x1_t b);
+FORCE_INLINE float64x1_t vrsqrts_f64(float64x1_t a, float64x1_t b) {  // 1 x f64 lane: (3.0 - a * b) / 2.0
+  return __riscv_vfdiv_vf_f64m1(__riscv_vfnmsac_vv_f64m1(vdup_n_f64(3.0), a, b, 1), 2.0, 1);  // 3.0 - a*b, then / 2.0; vl = 1
+}
 
-// FORCE_INLINE float64x2_t vrsqrtsq_f64(float64x2_t a, float64x2_t b);
+FORCE_INLINE float64x2_t vrsqrtsq_f64(float64x2_t a, float64x2_t b) {  // 2 x f64 lanes: (3.0 - a * b) / 2.0
+  return __riscv_vfdiv_vf_f64m1(__riscv_vfnmsac_vv_f64m1(vdupq_n_f64(3.0), a, b, 2), 2.0, 2);  // 3.0 - a*b, then / 2.0; vl = 2
+}
 
-// FORCE_INLINE float32_t vrsqrtss_f32(float32_t a, float32_t b);
+FORCE_INLINE float32_t vrsqrtss_f32(float32_t a, float32_t b) { return (3.0f - a * b) / 2.0f; }  // scalar f32: (3 - a*b) / 2, kept in single precision
 
-// FORCE_INLINE float64_t vrsqrtsd_f64(float64_t a, float64_t b);
+FORCE_INLINE float64_t vrsqrtsd_f64(float64_t a, float64_t b) { return (3.0 - a * b) / 2.0; }  // scalar f64: (3 - a*b) / 2
 
 FORCE_INLINE int8x8_t vshl_s8(int8x8_t a, int8x8_t b) {
   // implementation only works within defined range 'b' in [0, 7]

diff --git a/tests/impl.cpp b/tests/impl.cpp
@@ -16998,8 +16998,8 @@ result_t test_vsqrtq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return
 
 result_t test_vrsqrts_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
-  const float *_a = impl.test_cases_float_pointer1;
-  const float *_b = impl.test_cases_float_pointer2;
+  const float *_a = (const float *)impl.test_cases_float_pointer1;
+  const float *_b = (const float *)impl.test_cases_float_pointer2;
   float _c[2];
   _c[0] = (3.0 - _a[0] * _b[0]) / 2.0;
   _c[1] = (3.0 - _a[1] * _b[1]) / 2.0;
@@ -17016,8 +17016,8 @@ result_t test_vrsqrts_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 
 result_t test_vrsqrtsq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
-  const float *_a = impl.test_cases_float_pointer1;
-  const float *_b = impl.test_cases_float_pointer2;
+  const float *_a = (const float *)impl.test_cases_float_pointer1;
+  const float *_b = (const float *)impl.test_cases_float_pointer2;
   float _c[4];
   _c[0] = (3.0 - _a[0] * _b[0]) / 2.0;
   _c[1] = (3.0 - _a[1] * _b[1]) / 2.0;
@@ -17034,13 +17034,68 @@ result_t test_vrsqrtsq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #endif  // ENABLE_TEST_ALL
 }
 
-result_t test_vrsqrts_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vrsqrts_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {  // compares vrsqrts_f64 with a scalar reference
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;  // NOTE(review): float-named buffer read as double - confirm
+  const double *_b = (const double *)impl.test_cases_float_pointer2;
+  double _c[1];
+  _c[0] = (3.0 - _a[0] * _b[0]) / 2.0;  // expected value for the single lane
 
-result_t test_vrsqrtsq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  float64x1_t a = vld1_f64(_a);
+  float64x1_t b = vld1_f64(_b);
+  float64x1_t c = vrsqrts_f64(a, b);
 
-result_t test_vrsqrtss_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  return validate_double_error(c, _c[0], 0.0001f);  // last argument is presumably the permitted error - see validate_double_error
+#else
+  return TEST_UNIMPL;  // returned when ENABLE_TEST_ALL is not defined
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vrsqrtsq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {  // compares vrsqrtsq_f64 with a scalar reference
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;  // NOTE(review): float-named buffer read as double - confirm
+  const double *_b = (const double *)impl.test_cases_float_pointer2;
+  double _c[2];
+  _c[0] = (3.0 - _a[0] * _b[0]) / 2.0;  // expected lane 0
+  _c[1] = (3.0 - _a[1] * _b[1]) / 2.0;  // expected lane 1
 
-result_t test_vrsqrtsd_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  float64x2_t a = vld1q_f64(_a);
+  float64x2_t b = vld1q_f64(_b);
+  float64x2_t c = vrsqrtsq_f64(a, b);
+
+  return validate_double_error(c, _c[0], _c[1], 0.0001f);  // last argument is presumably the permitted error - see validate_double_error
+#else
+  return TEST_UNIMPL;  // returned when ENABLE_TEST_ALL is not defined
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vrsqrtss_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {  // compares scalar vrsqrtss_f32 with a reference value
+#ifdef ENABLE_TEST_ALL
+  const float *_a = (const float *)impl.test_cases_float_pointer1;
+  const float *_b = (const float *)impl.test_cases_float_pointer2;
+  float _c, c;
+  _c = (3.0 - _a[0] * _b[0]) / 2.0;  // reference: subtract/divide use double literals, result narrowed to float on assignment
+
+  c = vrsqrtss_f32(_a[0], _b[0]);
+  return validate_double_error(c, _c, 0.0001f);  // NOTE(review): float values go through validate_double_error - confirm this is intended
+#else
+  return TEST_UNIMPL;  // returned when ENABLE_TEST_ALL is not defined
+#endif  // ENABLE_TEST_ALL
+}
+
+result_t test_vrsqrtsd_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {  // compares scalar vrsqrtsd_f64 with a reference value
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;  // NOTE(review): float-named buffer read as double - confirm
+  const double *_b = (const double *)impl.test_cases_float_pointer2;
+  double _c, c;
+  _c = (3.0 - _a[0] * _b[0]) / 2.0;  // reference value from the first element of each input
+
+  c = vrsqrtsd_f64(_a[0], _b[0]);
+  return validate_double_error(c, _c, 0.0001f);  // last argument is presumably the permitted error - see validate_double_error
+#else
+  return TEST_UNIMPL;  // returned when ENABLE_TEST_ALL is not defined
+#endif  // ENABLE_TEST_ALL
+}
 
 result_t test_vshl_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL

diff --git a/tests/impl.h b/tests/impl.h
@@ -967,10 +967,10 @@
   /*_(vsqrtq_f64)                                                             */ \
   _(vrsqrts_f32)                                                                 \
   _(vrsqrtsq_f32)                                                                \
-  /*_(vrsqrts_f64)                                                            */ \
-  /*_(vrsqrtsq_f64)                                                           */ \
-  /*_(vrsqrtss_f32)                                                           */ \
-  /*_(vrsqrtsd_f64)                                                           */ \
+  _(vrsqrts_f64)                                                                 \
+  _(vrsqrtsq_f64)                                                                \
+  _(vrsqrtss_f32)                                                                \
+  _(vrsqrtsd_f64)                                                                \
   _(vshl_s8)                                                                     \
   _(vshl_s16)                                                                    \
   _(vshl_s32)                                                                    \