Skip to content

Commit

Permalink
Merge pull request #500 from howjmay/vsqrt
Browse files Browse the repository at this point in the history
feat: Add vsqrt[q]_[f32|f64]
  • Loading branch information
howjmay authored Aug 1, 2024
2 parents 4e39149 + 3f8d547 commit c235c2a
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 12 deletions.
8 changes: 4 additions & 4 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -4800,13 +4800,13 @@ FORCE_INLINE float32_t vrecpss_f32(float32_t a, float32_t b) { return 2.0 - a *

// Returns 2.0 - a * b for the two scalar operands.
FORCE_INLINE float64_t vrecpsd_f64(float64_t a, float64_t b) {
  return 2.0 - (a * b);
}

// FORCE_INLINE float32x2_t vsqrt_f32(float32x2_t a);
// Applies the RVV vfsqrt intrinsic to `a` with a vector length of 2 and
// returns the resulting vector.
FORCE_INLINE float32x2_t vsqrt_f32(float32x2_t a) {
  float32x2_t roots = __riscv_vfsqrt_v_f32m1(a, 2);
  return roots;
}

// FORCE_INLINE float32x4_t vsqrtq_f32(float32x4_t a);
// Applies the RVV vfsqrt intrinsic to `a` with a vector length of 4 and
// returns the resulting vector.
FORCE_INLINE float32x4_t vsqrtq_f32(float32x4_t a) {
  float32x4_t roots = __riscv_vfsqrt_v_f32m1(a, 4);
  return roots;
}

// FORCE_INLINE float64x1_t vsqrt_f64(float64x1_t a);
// Applies the RVV vfsqrt intrinsic to `a` with a vector length of 1 and
// returns the resulting vector.
FORCE_INLINE float64x1_t vsqrt_f64(float64x1_t a) {
  float64x1_t root = __riscv_vfsqrt_v_f64m1(a, 1);
  return root;
}

// FORCE_INLINE float64x2_t vsqrtq_f64(float64x2_t a);
// Applies the RVV vfsqrt intrinsic to `a` with a vector length of 2 and
// returns the resulting vector.
FORCE_INLINE float64x2_t vsqrtq_f64(float64x2_t a) {
  float64x2_t roots = __riscv_vfsqrt_v_f64m1(a, 2);
  return roots;
}

FORCE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b) {
return __riscv_vfdiv_vf_f32m1(__riscv_vfnmsac_vv_f32m1(vdup_n_f32(3.0), a, b, 2), 2.0, 2);
Expand Down
68 changes: 64 additions & 4 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17010,13 +17010,73 @@ result_t test_vrecpsd_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#endif // ENABLE_TEST_ALL
}

// Placeholder for the vsqrt_f32 test: performs no checks and reports
// TEST_UNIMPL.
result_t test_vsqrt_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
  return TEST_UNIMPL;
}
// Checks vsqrt_f32 against sqrtf() on the first two floats of the test
// input buffer. When ENABLE_TEST_ALL is not defined the test reports
// TEST_UNIMPL instead.
result_t test_vsqrt_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *src = (const float *)impl.test_cases_float_pointer1;

  // Scalar reference values, one per lane.
  float expected0 = sqrtf(src[0]);
  float expected1 = sqrtf(src[1]);

  // Vector result under test.
  float32x2_t input = vld1_f32(src);
  float32x2_t actual = vsqrt_f32(input);

  // 0.0001f is handed to the validator as its final argument (presumably the
  // permitted error; the helper is defined elsewhere).
  return validate_float_error(actual, expected0, expected1, 0.0001f);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Checks vsqrtq_f32 against sqrtf() on the first four floats of the test
// input buffer. When ENABLE_TEST_ALL is not defined the test reports
// TEST_UNIMPL instead.
result_t test_vsqrtq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const float *src = (const float *)impl.test_cases_float_pointer1;

  // Scalar reference values, one per lane.
  float expected0 = sqrtf(src[0]);
  float expected1 = sqrtf(src[1]);
  float expected2 = sqrtf(src[2]);
  float expected3 = sqrtf(src[3]);

  // Vector result under test.
  float32x4_t input = vld1q_f32(src);
  float32x4_t actual = vsqrtq_f32(input);

  // 0.0001f is handed to the validator as its final argument (presumably the
  // permitted error; the helper is defined elsewhere).
  return validate_float_error(actual, expected0, expected1, expected2, expected3, 0.0001f);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Checks vsqrt_f64 against sqrt() on the first double read from the test
// input buffer (the float buffer is reinterpreted as doubles). When
// ENABLE_TEST_ALL is not defined the test reports TEST_UNIMPL instead.
result_t test_vsqrt_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const double *src = (const double *)impl.test_cases_float_pointer1;

  // Scalar reference value for the single lane.
  double expected = sqrt(src[0]);

  // Vector result under test.
  float64x1_t input = vld1_f64(src);
  float64x1_t actual = vsqrt_f64(input);

  // 0.0001f is handed to the validator as its final argument (presumably the
  // permitted error; the helper is defined elsewhere).
  return validate_double_error(actual, expected, 0.0001f);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

// Placeholder for the vsqrtq_f32 test: performs no checks and reports
// TEST_UNIMPL.
result_t test_vsqrtq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
  return TEST_UNIMPL;
}
// Checks vsqrtq_f64 against sqrt() on the first two doubles read from the
// test input buffer (the float buffer is reinterpreted as doubles). When
// ENABLE_TEST_ALL is not defined the test reports TEST_UNIMPL instead.
result_t test_vsqrtq_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
  const double *_a = (const double *)impl.test_cases_float_pointer1;
  // float64x2_t carries two lanes and only _c[0]/_c[1] are validated, so the
  // reference loop reads exactly two inputs. Reading further elements is
  // unnecessary and could run past the end of the input buffer
  // (NOTE(review): buffer size is not visible here -- confirm).
  double _c[2];
  for (int i = 0; i < 2; i++) {
    _c[i] = sqrt(_a[i]);
  }

  float64x2_t a = vld1q_f64(_a);
  float64x2_t c = vsqrtq_f64(a);

  // 0.0001f is handed to the validator as its final argument (presumably the
  // permitted error; the helper is defined elsewhere).
  return validate_double_error(c, _c[0], _c[1], 0.0001f);
#else
  return TEST_UNIMPL;
#endif  // ENABLE_TEST_ALL
}

result_t test_vrsqrts_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifdef ENABLE_TEST_ALL
Expand Down
8 changes: 4 additions & 4 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -961,10 +961,10 @@
_(vrecpsq_f64) \
_(vrecpss_f32) \
_(vrecpsd_f64) \
/*_(vsqrt_f32) */ \
/*_(vsqrtq_f32) */ \
/*_(vsqrt_f64) */ \
/*_(vsqrtq_f64) */ \
_(vsqrt_f32) \
_(vsqrtq_f32) \
_(vsqrt_f64) \
_(vsqrtq_f64) \
_(vrsqrts_f32) \
_(vrsqrtsq_f32) \
_(vrsqrts_f64) \
Expand Down

0 comments on commit c235c2a

Please sign in to comment.