diff --git a/sse2rvv.h b/sse2rvv.h
index 9d016ac..b6ed1ec 100644
--- a/sse2rvv.h
+++ b/sse2rvv.h
@@ -127,7 +127,7 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
   __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i16m1_i32m1(x))
 #define vreinterpretq_i32_m128(x) __riscv_vreinterpret_v_i32m1_f32m1(x)
 #define vreinterpretq_i64_m128(x)                                             \
-  __riscv_vreinterpret_v_f64m1_f32m1(__riscv_vreinterpret_v_i64m1_f64m1(x))
+  __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i64m1_i32m1(x))
 #define vreinterpretq_f32_m128(x) (x)
 #define vreinterpretq_f64_m128(x)                                             \
   __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(     \
@@ -1685,8 +1685,7 @@ FORCE_INLINE __m128d _mm_load_pd(double const *mem_addr) {
 }
 
 FORCE_INLINE __m128d _mm_load_pd1(double const *mem_addr) {
-  double p[2] = {mem_addr[0], mem_addr[0]};
-  return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(p, 2));
+  return vreinterpretq_f64_m128d(__riscv_vfmv_v_f_f64m1(mem_addr[0], 2));
 }
 
 FORCE_INLINE __m128 _mm_load_ps(float const *mem_addr) {
@@ -1694,13 +1693,13 @@ FORCE_INLINE __m128 _mm_load_ps(float const *mem_addr) {
 }
 
 FORCE_INLINE __m128 _mm_load_ps1(float const *mem_addr) {
-  float p[4] = {mem_addr[0], mem_addr[0], mem_addr[0], mem_addr[0]};
-  return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(p, 4));
+  return vreinterpretq_f32_m128(__riscv_vfmv_v_f_f32m1(mem_addr[0], 4));
 }
 
 FORCE_INLINE __m128d _mm_load_sd(double const *mem_addr) {
-  double p[2] = {mem_addr[0], 0};
-  return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(p, 2));
+  vfloat64m1_t addr = __riscv_vle64_v_f64m1(mem_addr, 1);
+  vfloat64m1_t zeros = __riscv_vfmv_v_f_f64m1(0, 2);
+  return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1(zeros, addr, 0, 1));
 }
 
 FORCE_INLINE __m128i _mm_load_si128(__m128i const *mem_addr) {
@@ -1709,8 +1708,9 @@ FORCE_INLINE __m128i _mm_load_si128(__m128i const *mem_addr) {
 }
 
 FORCE_INLINE __m128 _mm_load_ss(float const *mem_addr) {
-  float p[4] = {mem_addr[0], 0, 0, 0};
-  return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(p, 4));
+  vfloat32m1_t addr = __riscv_vle32_v_f32m1(mem_addr, 1);
+  vfloat32m1_t zeros = __riscv_vfmv_v_f_f32m1(0, 4);
+  return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1(zeros, addr, 0, 1));
 }
 
 FORCE_INLINE __m128d _mm_load1_pd(double const *mem_addr) {
@@ -1723,19 +1723,49 @@ FORCE_INLINE __m128 _mm_load1_ps(float const *mem_addr) {
 
 // FORCE_INLINE __m128d _mm_loaddup_pd (double const* mem_addr) {}
 
-// FORCE_INLINE __m128d _mm_loadh_pd (__m128d a, double const* mem_addr) {}
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, double const *mem_addr) {
+  vfloat64m1_t _a = vreinterpretq_m128d_f64(a);
+  vfloat64m1_t addr = __riscv_vle64_v_f64m1(mem_addr, 1);
+  return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1(_a, addr, 1, 2));
+}
 
-// FORCE_INLINE __m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr) {}
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *mem_addr) {
+  vint64m1_t _a = vreinterpretq_m128_i64(a);
+  vint64m1_t addr = vreinterpretq_m64_i64(*mem_addr);
+  return vreinterpretq_i64_m128(__riscv_vslideup_vx_i64m1(_a, addr, 1, 2));
+}
 
-// FORCE_INLINE __m128i _mm_loadl_epi64 (__m128i const* mem_addr) {}
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *mem_addr) {
+  vint64m1_t addr = vreinterpretq_m128i_i64(*mem_addr);
+  vint64m1_t zeros = __riscv_vmv_v_x_i64m1(0, 2);
+  return vreinterpretq_i64_m128i(__riscv_vslideup_vx_i64m1(addr, zeros, 1, 2));
+}
 
-// FORCE_INLINE __m128d _mm_loadl_pd (__m128d a, double const* mem_addr) {}
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, double const *mem_addr) {
+  vfloat64m1_t _a = vreinterpretq_m128d_f64(a);
+  vfloat64m1_t addr = __riscv_vle64_v_f64m1(mem_addr, 1);
+  return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1(_a, addr, 0, 1));
+}
 
-// FORCE_INLINE __m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr) {}
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *mem_addr) {
+  vint64m1_t _a = vreinterpretq_m128_i64(a);
+  vint64m1_t addr = vreinterpretq_m64_i64(*mem_addr);
+  return vreinterpretq_i64_m128(__riscv_vslideup_vx_i64m1(_a, addr, 0, 1));
+}
 
-// FORCE_INLINE __m128d _mm_loadr_pd (double const* mem_addr) {}
+FORCE_INLINE __m128d _mm_loadr_pd(double const *mem_addr) {
+  vfloat64m1_t addr = __riscv_vle64_v_f64m1(mem_addr, 2);
+  vfloat64m1_t addr_high = __riscv_vslidedown_vx_f64m1(addr, 1, 2);
+  return vreinterpretq_f64_m128d(
+      __riscv_vslideup_vx_f64m1(addr_high, addr, 1, 2));
+}
 
-// FORCE_INLINE __m128 _mm_loadr_ps (float const* mem_addr) {}
+FORCE_INLINE __m128 _mm_loadr_ps(float const *mem_addr) {
+  vuint32m1_t addr = __riscv_vle32_v_u32m1((uint32_t const *)mem_addr, 4);
+  vuint32m1_t vid = __riscv_vid_v_u32m1(4);
+  vuint32m1_t vid_rev = __riscv_vrsub_vx_u32m1(vid, 3, 4);
+  return vreinterpretq_u32_m128(__riscv_vrgather_vv_u32m1(addr, vid_rev, 4));
+}
 
 FORCE_INLINE __m128d _mm_loadu_pd(double const *mem_addr) {
   return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(mem_addr, 2));
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 9ff40f2..88c8322 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -2194,43 +2194,43 @@ result_t test_mm_load1_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_loadh_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *p1 = impl.test_cases_float_pointer1;
-  //   const float *p2 = impl.test_cases_float_pointer2;
-  //   const __m64 *b = (const __m64 *)p2;
-  //   __m128 a = _mm_load_ps(p1);
-  //   __m128 c = _mm_loadh_pi(a, b);
-  //
-  //   return validate_float(c, p1[0], p1[1], p2[0], p2[1]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *p1 = impl.test_cases_float_pointer1;
+  const float *p2 = impl.test_cases_float_pointer2;
+  const __m64 *b = (const __m64 *)p2;
+  __m128 a = _mm_load_ps(p1);
+  __m128 c = _mm_loadh_pi(a, b);
+
+  return validate_float(c, p1[0], p1[1], p2[0], p2[1]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadl_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *p1 = impl.test_cases_float_pointer1;
-  //   const float *p2 = impl.test_cases_float_pointer2;
-  //   __m128 a = _mm_load_ps(p1);
-  //   const __m64 *b = (const __m64 *)p2;
-  //   __m128 c = _mm_loadl_pi(a, b);
-  //
-  //   return validate_float(c, p2[0], p2[1], p1[2], p1[3]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *p1 = impl.test_cases_float_pointer1;
+  const float *p2 = impl.test_cases_float_pointer2;
+  __m128 a = _mm_load_ps(p1);
+  const __m64 *b = (const __m64 *)p2;
+  __m128 c = _mm_loadl_pi(a, b);
+
+  return validate_float(c, p2[0], p2[1], p1[2], p1[3]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadr_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *addr = impl.test_cases_float_pointer1;
-  //
-  //   __m128 ret = _mm_loadr_ps(addr);
-  //
-  //   return validate_float(ret, addr[3], addr[2], addr[1], addr[0]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *addr = impl.test_cases_float_pointer1;
+
+  __m128 ret = _mm_loadr_ps(addr);
+
+  return validate_float(ret, addr[3], addr[2], addr[1], addr[0]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadu_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
@@ -5664,55 +5664,55 @@ result_t test_mm_load1_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_loadh_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *_a = (const double *)impl.test_cases_float_pointer1;
-  //   const double *addr = (const double *)impl.test_cases_float_pointer2;
-  //
-  //   __m128d a = load_m128d(_a);
-  //   __m128d ret = _mm_loadh_pd(a, addr);
-  //
-  //   return validate_double(ret, _a[0], addr[0]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;
+  const double *addr = (const double *)impl.test_cases_float_pointer2;
+
+  __m128d a = load_m128d(_a);
+  __m128d ret = _mm_loadh_pd(a, addr);
+
+  return validate_double(ret, _a[0], addr[0]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadl_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const int64_t *addr = (const int64_t *)impl.test_cases_int_pointer1;
-  //
-  //   __m128i ret = _mm_loadl_epi64((const __m128i *)addr);
-  //
-  //   return validate_int64(ret, addr[0], 0);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int64_t *addr = (const int64_t *)impl.test_cases_int_pointer1;
+
+  __m128i ret = _mm_loadl_epi64((const __m128i *)addr);
+
+  return validate_int64(ret, addr[0], 0);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadl_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *_a = (const double *)impl.test_cases_float_pointer1;
-  //   const double *addr = (const double *)impl.test_cases_float_pointer2;
-  //
-  //   __m128d a = load_m128d(_a);
-  //   __m128d ret = _mm_loadl_pd(a, addr);
-  //
-  //   return validate_double(ret, addr[0], _a[1]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;
+  const double *addr = (const double *)impl.test_cases_float_pointer2;
+
+  __m128d a = load_m128d(_a);
+  __m128d ret = _mm_loadl_pd(a, addr);
+
+  return validate_double(ret, addr[0], _a[1]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadr_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *addr = (const double *)impl.test_cases_float_pointer1;
-  //
-  //   __m128d ret = _mm_loadr_pd(addr);
-  //
-  //   return validate_double(ret, addr[1], addr[0]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *addr = (const double *)impl.test_cases_float_pointer1;
+
+  __m128d ret = _mm_loadr_pd(addr);
+
+  return validate_double(ret, addr[1], addr[0]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadu_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
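
For reviewers without an RVV toolchain at hand, the standalone C sketch below models the lane semantics the new slide/gather based paths are expected to produce. It is only a scalar cross-check of the index maps used in the patch, not part of sse2rvv; the helper names (loadh_pd_ref, loadl_pi_ref, loadr_ps_ref) are hypothetical and exist only for this illustration.

#include <stdio.h>

/* Reference for _mm_loadh_pd: keep the low lane of a, replace the high lane
 * with *mem. This is the effect targeted by vslideup with offset 1, vl 2. */
static void loadh_pd_ref(double out[2], const double a[2], const double *mem) {
  out[0] = a[0];
  out[1] = mem[0];
}

/* Reference for _mm_loadl_pi: replace the low 64 bits (two floats) of a with
 * the 64 bits at mem. This is the effect targeted by the 64-bit vslideup with
 * offset 0, vl 1. */
static void loadl_pi_ref(float out[4], const float a[4], const float mem[2]) {
  out[0] = mem[0];
  out[1] = mem[1];
  out[2] = a[2];
  out[3] = a[3];
}

/* Reference for _mm_loadr_ps: reversed load, the same index map produced by
 * vrgather with indices 3 - vid. */
static void loadr_ps_ref(float out[4], const float *mem) {
  for (int i = 0; i < 4; ++i)
    out[i] = mem[3 - i];
}

int main(void) {
  const double a_pd[2] = {1.0, 2.0}, m_pd[2] = {9.0, 8.0};
  double r_pd[2];
  loadh_pd_ref(r_pd, a_pd, m_pd);
  printf("loadh_pd: {%g, %g}\n", r_pd[0], r_pd[1]); /* expect {1, 9} */

  const float a_ps[4] = {1, 2, 3, 4}, m_ps[2] = {9, 8};
  float r_lo[4];
  loadl_pi_ref(r_lo, a_ps, m_ps);
  printf("loadl_pi: {%g, %g, %g, %g}\n", r_lo[0], r_lo[1], r_lo[2],
         r_lo[3]); /* expect {9, 8, 3, 4} */

  const float mem[4] = {1, 2, 3, 4};
  float r_rev[4];
  loadr_ps_ref(r_rev, mem);
  printf("loadr_ps: {%g, %g, %g, %g}\n", r_rev[0], r_rev[1], r_rev[2],
         r_rev[3]); /* expect {4, 3, 2, 1} */
  return 0;
}

The expected outputs match the validations in the enabled tests above: test_mm_loadh_pd checks {_a[0], addr[0]}, test_mm_loadl_pi checks {p2[0], p2[1], p1[2], p1[3]}, and test_mm_loadr_ps checks {addr[3], addr[2], addr[1], addr[0]}.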