diff --git a/sse2rvv.h b/sse2rvv.h
index b2ffd7e..7fdb164 100644
--- a/sse2rvv.h
+++ b/sse2rvv.h
@@ -149,7 +149,9 @@ typedef vint32m1_t __m128i; /* 128-bit vector containing integers */
 #define vreinterpretq_i32_m128d(x)                                            \
   __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1(x))
 #define vreinterpretq_i64_m128d(x) __riscv_vreinterpret_v_i64m1_f64m1(x)
-#define vreinterpretq_f32_m128d(x) __riscv_vreinterpret_v_f32m1_f64m1(x)
+#define vreinterpretq_f32_m128d(x)                                            \
+  __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1(      \
+      __riscv_vreinterpret_v_f32m1_i32m1(x)))
 #define vreinterpretq_f64_m128d(x) (x)
 
 #define vreinterpretq_m128i_u8(x)                                             \
@@ -180,7 +182,7 @@ typedef vint32m1_t __m128i; /* 128-bit vector containing integers */
 #define vreinterpretq_i64_m128i(x) __riscv_vreinterpret_v_i64m1_i32m1(x)
 #define vreinterpretq_f32_m128i(x) __riscv_vreinterpret_v_f32m1_i32m1(x)
 #define vreinterpretq_f64_m128i(x)                                            \
-  __riscv_vreinterpret_v_f32m1_i32m1(__riscv_vreinterpret_v_f64m1_f32m1(x))
+  __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(x))
 
 #define vreinterpretq_m64_u8(x)                                               \
   __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_f32m1_u32m1(x))
@@ -849,7 +851,9 @@ FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {
 // elements) from memory into dst. mem_addr must be aligned on a 16-byte
 // boundary or a general-protection exception may be generated.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
-// FORCE_INLINE __m128 _mm_load_ps(const float *p) {}
+FORCE_INLINE __m128 _mm_load_ps(float const *mem_addr) {
+  return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(mem_addr, 4));
+}
 
 // Load a single-precision (32-bit) floating-point element from memory into all
 // elements of dst.
@@ -860,18 +864,23 @@ FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {
 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
 //
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
-// #define _mm_load_ps1 _mm_load1_ps
+#define _mm_load_ps1 _mm_load1_ps
 
 // Load a single-precision (32-bit) floating-point element from memory into the
 // lower of dst, and zero the upper 3 elements. mem_addr does not need to be
 // aligned on any particular boundary.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
-// FORCE_INLINE __m128 _mm_load_ss(const float *p) {}
+FORCE_INLINE __m128 _mm_load_ss(float const *mem_addr) {
+  float p[4] = {mem_addr[0], 0, 0, 0};
+  return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(p, 4));
+}
 
 // Load a single-precision (32-bit) floating-point element from memory into all
 // elements of dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
-// FORCE_INLINE __m128 _mm_load1_ps(const float *p) {}
+FORCE_INLINE __m128 _mm_load1_ps(float const *mem_addr) {
+  return vreinterpretq_f32_m128(__riscv_vfmv_v_f_f32m1(mem_addr[0], 4));
+}
 
 // Load 2 single-precision (32-bit) floating-point elements from memory into the
 // upper 2 elements of dst, and copy the lower 2 elements from a to dst.
@@ -1931,28 +1940,38 @@ FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) {
 // elements) from memory into dst. mem_addr must be aligned on a 16-byte
 // boundary or a general-protection exception may be generated.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
-// FORCE_INLINE __m128d _mm_load_pd(const double *p) {}
+FORCE_INLINE __m128d _mm_load_pd(double const *mem_addr) {
+  return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(mem_addr, 2));
+}
 
 // Load a double-precision (64-bit) floating-point element from memory into both
 // elements of dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
-// #define _mm_load_pd1 _mm_load1_pd
+#define _mm_load_pd1 _mm_load1_pd
 
 // Load a double-precision (64-bit) floating-point element from memory into the
 // lower of dst, and zero the upper element. mem_addr does not need to be
 // aligned on any particular boundary.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
-// FORCE_INLINE __m128d _mm_load_sd(const double *p) {}
+FORCE_INLINE __m128d _mm_load_sd(double const *mem_addr) {
+  double p[2] = {mem_addr[0], 0};
+  return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(p, 2));
+}
 
 // Load 128-bits of integer data from memory into dst. mem_addr must be aligned
 // on a 16-byte boundary or a general-protection exception may be generated.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
-// FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) {}
+FORCE_INLINE __m128i _mm_load_si128(__m128i const *mem_addr) {
+  return vreinterpretq_f64_m128i(
+      __riscv_vle64_v_f64m1((double const *)mem_addr, 2));
+}
 
 // Load a double-precision (64-bit) floating-point element from memory into both
 // elements of dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
-// FORCE_INLINE __m128d _mm_load1_pd(const double *p) {}
+FORCE_INLINE __m128d _mm_load1_pd(double const *mem_addr) {
+  return vreinterpretq_f64_m128d(__riscv_vfmv_v_f_f64m1(mem_addr[0], 2));
+}
 
 // Load a double-precision (64-bit) floating-point element from memory into the
 // upper element of dst, and copy the lower element from a to dst. mem_addr does
@@ -1979,7 +1998,7 @@ FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) {
 // Loads two double-precision from unaligned memory, floating-point values.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
 FORCE_INLINE __m128d _mm_loadu_pd(const double *p) {
-  return __riscv_vle64_v_f64m1(p, 2);
+  return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(p, 2));
 }
 
 // Load 128-bits of integer data from memory into dst. mem_addr does not need to
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 2560a6e..4233cb5 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -2217,49 +2217,49 @@ result_t test_mm_insert_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_load_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *addr = impl.test_cases_float_pointer1;
-  //
-  //   __m128 ret = _mm_load_ps(addr);
-  //
-  //   return validate_float(ret, addr[0], addr[1], addr[2], addr[3]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *addr = impl.test_cases_float_pointer1;
+
+  __m128 ret = _mm_load_ps(addr);
+
+  return validate_float(ret, addr[0], addr[1], addr[2], addr[3]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_load_ps1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *addr = impl.test_cases_float_pointer1;
-  //
-  //   __m128 ret = _mm_load_ps1(addr);
-  //
-  //   return validate_float(ret, addr[0], addr[0], addr[0], addr[0]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *addr = impl.test_cases_float_pointer1;
+
+  __m128 ret = _mm_load_ps1(addr);
+
+  return validate_float(ret, addr[0], addr[0], addr[0], addr[0]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_load_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *addr = impl.test_cases_float_pointer1;
-  //
-  //   __m128 ret = _mm_load_ss(addr);
-  //
-  //   return validate_float(ret, addr[0], 0, 0, 0);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *addr = impl.test_cases_float_pointer1;
+
+  __m128 ret = _mm_load_ss(addr);
+
+  return validate_float(ret, addr[0], 0, 0, 0);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_load1_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const float *p = impl.test_cases_float_pointer1;
-  //   __m128 a = _mm_load1_ps(p);
-  //   return validate_float(a, p[0], p[0], p[0], p[0]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *p = impl.test_cases_float_pointer1;
+  __m128 a = _mm_load1_ps(p);
+  return validate_float(a, p[0], p[0], p[0], p[0]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadh_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
@@ -5641,57 +5641,57 @@ result_t test_mm_lfence(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }
 
 result_t test_mm_load_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *p = (const double *)impl.test_cases_float_pointer1;
-  //   __m128d a = _mm_load_pd(p);
-  //   return validate_double(a, p[0], p[1]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *p = (const double *)impl.test_cases_float_pointer1;
+  __m128d a = _mm_load_pd(p);
+  return validate_double(a, p[0], p[1]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_load_pd1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *p = (const double *)impl.test_cases_float_pointer1;
-  //   __m128d a = _mm_load_pd1(p);
-  //   return validate_double(a, p[0], p[0]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *p = (const double *)impl.test_cases_float_pointer1;
+  __m128d a = _mm_load_pd1(p);
+  return validate_double(a, p[0], p[0]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_load_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *p = (const double *)impl.test_cases_float_pointer1;
-  //   __m128d a = _mm_load_sd(p);
-  //   return validate_double(a, p[0], 0);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *p = (const double *)impl.test_cases_float_pointer1;
+  __m128d a = _mm_load_sd(p);
+  return validate_double(a, p[0], 0);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_load_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const int32_t *addr = impl.test_cases_int_pointer1;
-  //
-  //   __m128i ret = _mm_load_si128((const __m128i *)addr);
-  //
-  //   return VALIDATE_INT32_M128(ret, addr);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int32_t *addr = impl.test_cases_int_pointer1;
+
+  __m128i ret = _mm_load_si128((const __m128i *)addr);
+
+  return VALIDATE_INT32_M128(ret, addr);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_load1_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  //   const double *addr = (const double *)impl.test_cases_float_pointer1;
-  //
-  //   __m128d ret = _mm_load1_pd(addr);
-  //
-  //   return validate_double(ret, addr[0], addr[0]);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *addr = (const double *)impl.test_cases_float_pointer1;
+
+  __m128d ret = _mm_load1_pd(addr);
+
+  return validate_double(ret, addr[0], addr[0]);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }
 
 result_t test_mm_loadh_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
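
Note, not part of the patch: the two vreinterpretq macro fixes work around the fact
that __riscv_vreinterpret only converts between integer and floating-point types of
the same element width, or between integer types of different widths; there is no
direct f32m1<->f64m1 reinterpret, so the corrected macros hop through i32m1/i64m1.
Below is a minimal smoke-test sketch for the newly implemented loads, mirroring what
tests/impl.cpp validates. It assumes an RVV 1.0 toolchain (e.g.
riscv64-unknown-linux-gnu-gcc -march=rv64gcv smoke.c) and that __m128/__m128d are
this header's vfloat32m1_t/vfloat64m1_t typedefs (as __m128i is vint32m1_t above),
so raw __riscv_vse* stores can read the lanes back; the file name smoke.c is
illustrative only.

// smoke.c -- hedged usage sketch for the loads added in this patch.
#include <stdio.h>
#include "sse2rvv.h"

int main(void) {
  float fsrc[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  double dsrc[2] = {5.0, 6.0};
  float fout[4];
  double dout[2];

  __m128 v = _mm_load_ss(fsrc);   // lane 0 = 1.0f, lanes 1..3 zeroed
  __m128d d = _mm_load1_pd(dsrc); // both lanes = 5.0

  __riscv_vse32_v_f32m1(fout, v, 4); // raw RVV store of all 4 float lanes
  __riscv_vse64_v_f64m1(dout, d, 2); // raw RVV store of both double lanes
  printf("_mm_load_ss:  %.1f %.1f %.1f %.1f\n", fout[0], fout[1], fout[2],
         fout[3]);
  printf("_mm_load1_pd: %.1f %.1f\n", dout[0], dout[1]);
  return 0; // expected output: 1.0 0.0 0.0 0.0 and 5.0 5.0
}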