diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..8e75eee --- /dev/null +++ b/.clang-format @@ -0,0 +1,12 @@ +ColumnLimit: 80 +BraceWrapping: + AfterFunction: true + AfterNamespace: true + AfterStruct: true + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterUnion: true + AfterExternBlock: true + SplitEmptyFunction: false + SplitEmptyRecord: false diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml new file mode 100644 index 0000000..923d520 --- /dev/null +++ b/.github/workflows/github_actions.yml @@ -0,0 +1,45 @@ +name: Github Actions + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + cross_compile_tests: + runs-on: ubuntu-20.04 + steps: + - name: checkout code + uses: actions/checkout@v3.2.0 + - name: setup riscv toolchain + run: | + mkdir /opt/riscv + export PATH=$PATH:/opt/riscv/bin + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.10.18/riscv64-elf-ubuntu-20.04-gcc-nightly-2023.10.18-nightly.tar.gz + sudo tar -xzf riscv64-elf-ubuntu-20.04-gcc-nightly-2023.10.18-nightly.tar.gz -C /opt/ + + - name: run tests + run: | + export PATH=$PATH:/opt/riscv/bin + sh scripts/cross-test.sh qemu + + check_test_cases: + runs-on: ubuntu-20.04 + steps: + - name: checkout code + uses: actions/checkout@v3.2.0 + - name: build artifact + run: | + make test + + coding_style: + runs-on: ubuntu-20.04 + steps: + - name: checkout code + uses: actions/checkout@v3.2.0 + - name: style check + run: | + sudo apt-get install -q -y clang-format-12 + sh scripts/check-format.sh + shell: bash diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e92bb52 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.exe +*.o +*.gch +tests/*.d +tests/main +.vs/ +Debug/ +Release/ +*.log diff --git a/LICENSE b/LICENSE index df86fd7..9fa6f8d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Yang Hau +Copyright (c) 2023 SSE2RVV Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e714b3c --- /dev/null +++ b/Makefile @@ -0,0 +1,89 @@ +ifndef CC +override CC = gcc +endif + +ifndef CXX +override CXX = g++ +endif + +ifndef CROSS_COMPILE + processor := $(shell uname -m) +else # CROSS_COMPILE was set + CC = $(CROSS_COMPILE)gcc + CXX = $(CROSS_COMPILE)g++ + CXXFLAGS += -static + LDFLAGS += -static + + check_riscv := $(shell echo | $(CROSS_COMPILE)cpp -dM - | grep " __riscv_xlen " | cut -c22-) + uname_result := $(shell uname -m) + ifeq ($(check_riscv),64) + processor = rv64 + else ifeq ($(uname_result),rv64imafdc) + processor = rv64 + else ifeq ($(check_riscv),32) + processor = rv32 + else ifeq ($(uname_result),rv32i) + processor = rv32 + else + $(error Unsupported cross-compiler) + endif + + ifeq ($(processor),$(filter $(processor),i386 x86_64)) + ARCH_CFLAGS = -maes -mpclmul -mssse3 -msse4.2 + else + ARCH_CFLAGS = -march=$(processor)gcv_zba + endif + + ifeq ($(SIMULATOR_TYPE), qemu) + SIMULATOR += qemu-riscv64 + SIMULATOR_FLAGS = -cpu $(processor),v=true,zba=true,vlen=128 + else + SIMULATOR = spike + SIMULATOR_FLAGS = --isa=$(processor)gcv_zba + PROXY_KERNEL = pk + endif +endif + +CXXFLAGS += -Wall -Wcast-qual -I. 
$(ARCH_CFLAGS)
+LDFLAGS += -lm
+OBJS = \
+	tests/binding.o \
+	tests/common.o \
+	tests/impl.o \
+	tests/main.o
+deps := $(OBJS:%.o=%.o.d)
+
+.SUFFIXES: .o .cpp
+.cpp.o:
+	$(CXX) -o $@ $(CXXFLAGS) -c -MMD -MF $@.d $<
+
+EXEC = tests/main
+
+$(EXEC): $(OBJS)
+	$(CXX) $(LDFLAGS) -o $@ $^
+
+test: tests/main
+ifeq ($(processor),$(filter $(processor),rv32 rv64))
+	$(CC) $(ARCH_CFLAGS) -c sse2rvv.h
+endif
+	$(SIMULATOR) $(SIMULATOR_FLAGS) $(PROXY_KERNEL) $^
+
+build-test: tests/main
+ifeq ($(processor),$(filter $(processor),rv32 rv64))
+	$(CC) $(ARCH_CFLAGS) -c sse2rvv.h
+endif
+
+format:
+	@echo "Formatting files with clang-format.."
+	@if ! hash clang-format; then echo "clang-format is required to indent"; fi
+	clang-format -i sse2rvv.h tests/*.cpp tests/*.h
+
+.PHONY: clean check format
+
+clean:
+	$(RM) $(OBJS) $(EXEC) $(deps) sse2rvv.h.gch
+
+clean-all: clean
+	$(RM) *.log
+
+-include $(deps)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..48277b0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,94 @@
+# sse2rvv
+
+A C/C++ header file that converts Intel SSE intrinsics to RISC-V Vector (RVV) intrinsics.
+
+## Introduction
+
+`sse2rvv` is a translator of Intel SSE (Streaming SIMD Extensions) intrinsics
+to the [RISC-V Vector Extension (RVV)](https://github.com/riscv/riscv-v-spec),
+shortening the time needed to get a working RISC-V program that can then be used to
+extract profiles and to identify hot paths in the code.
+The header file `sse2rvv.h` contains several of the functions provided by Intel
+intrinsic headers such as `<xmmintrin.h>`, implemented with RVV-based counterparts
+to produce the exact semantics of the intrinsics.
+
+This project is based on [sse2neon](https://github.com/DLTcollab/sse2neon) and adapts it to RISC-V.
+
+## Mapping and Coverage
+
+Header file | Extension |
+---|---|
+`<mmintrin.h>` | MMX |
+`<xmmintrin.h>` | SSE |
+`<emmintrin.h>` | SSE2 |
+`<pmmintrin.h>` | SSE3 |
+`<tmmintrin.h>` | SSSE3 |
+`<smmintrin.h>` | SSE4.1 |
+`<nmmintrin.h>` | SSE4.2 |
+`<wmmintrin.h>` | AES |
+
+`sse2rvv` aims to support the SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AES extensions.
+
+In order to deliver RVV-equivalent intrinsics for all widely used SSE intrinsics,
+please be aware that while some SSE intrinsics have a direct mapping to a single
+concrete RVV intrinsic, others lack a 1:1 mapping, meaning that
+their equivalents have to be built from a number of RVV intrinsics.
+
+For example, the SSE intrinsic `_mm_loadu_si128` has a direct RVV mapping (a single
+unit-stride vector load), but `_mm_maddubs_epi16` has to be implemented with multiple RVV instructions.
+
+### Floating-point compatibility
+
+Some conversions require several RVV intrinsics, which may produce results that are
+inconsistent with their SSE counterparts due to intermediate rounding under IEEE-754 arithmetic.
+
+## Usage
+
+- Put the file `sse2rvv.h` into your source code directory.
+
+- Locate the following SSE header files included in the code:
+```C
+#include <xmmintrin.h>
+#include <emmintrin.h>
+```
+  The other SSE headers (`{p,t,s,n,w}mmintrin.h`) can be replaced in the same way.
+
+- Replace them with:
+```C
+#include "sse2rvv.h"
+```
+
+- Explicitly specify platform-specific options to gcc/clang compilers.
+  * On riscv64
+  ```shell
+  -march=rv64gcv_zba
+  ```
+
+## Run Built-in Test Suite
+
+`sse2rvv` provides a unified interface for developing test cases. These test
+cases are located in the `tests` directory, and the input data is specified at
+runtime. 
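+Before running the full suite, a minimal translation unit like the following can
+serve as a quick smoke test (illustrative only; it assumes `_mm_set_epi32`,
+`_mm_add_epi32`, and `_mm_storeu_si128` are among the intrinsics implemented by
+the header):
+```C
+#include <stdio.h>
+// #include <emmintrin.h>  /* original x86 build */
+#include "sse2rvv.h"       /* RISC-V build */
+
+int main(void) {
+  int out[4];
+  __m128i a = _mm_set_epi32(1, 2, 3, 4);
+  __m128i b = _mm_set_epi32(10, 20, 30, 40);
+  _mm_storeu_si128((__m128i *)out, _mm_add_epi32(a, b));
+  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
+  return 0;
+}
+```
+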
Use the following command to run the test cases:
+```shell
+$ make test
+```
+
+## Reference
+* [sse2neon](https://github.com/DLTcollab/sse2neon)
+* [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html)
+* [Microsoft: x86 intrinsics list](https://learn.microsoft.com/en-us/cpp/intrinsics/x86-intrinsics-list)
+* [Arm Neon Intrinsics Reference](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
+* [Neon Programmer's Guide for Armv8-A](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/neon-programmers-guide-for-armv8-a)
+* [NEON Programmer's Guide](https://static.docs.arm.com/den0018/a/DEN0018A_neon_programmers_guide_en.pdf)
+* [qemu/target/i386/ops_sse.h](https://github.com/qemu/qemu/blob/master/target/i386/ops_sse.h): Comprehensive SSE instruction emulation in C. Ideal for semantic checks.
+* [Porting Takua Renderer to 64-bit ARM- Part 1](https://blog.yiningkarlli.com/2021/05/porting-takua-to-arm-pt1.html)
+* [Porting Takua Renderer to 64-bit ARM- Part 2](https://blog.yiningkarlli.com/2021/07/porting-takua-to-arm-pt2.html)
+* [Comparing SIMD on x86-64 and arm64](https://blog.yiningkarlli.com/2021/09/neon-vs-sse.html)
+* [Port with SSE2Neon and SIMDe](https://developer.arm.com/documentation/102581/0200/Port-with-SSE2Neon-and-SIMDe)
+* [Genomics: Optimizing the BWA aligner for Arm Servers](https://community.arm.com/arm-community-blogs/b/high-performance-computing-blog/posts/optimizing-genomics-and-the-bwa-aligner-for-arm-servers)
+* [Bit twiddling with Arm Neon: beating SSE movemasks, counting bits and more](https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon)
+* [C/C++ on Graviton](https://github.com/aws/aws-graviton-getting-started/blob/main/c-c%2B%2B.md)
+
+## Licensing
+
+`sse2rvv` is freely redistributable under the MIT License.
diff --git a/scripts/check-format.sh b/scripts/check-format.sh
new file mode 100755
index 0000000..b22c668
--- /dev/null
+++ b/scripts/check-format.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -x
+
+for file in ${SOURCES};
+do
+    clang-format ${file} > expected-format
+    diff -u -p --label="${file}" --label="expected coding style" ${file} expected-format
+done
+exit $(clang-format --output-replacements-xml ${SOURCES} | egrep -c "</replacement")
diff --git a/scripts/cross-test.sh b/scripts/cross-test.sh
new file mode 100755
index 0000000..3275059
--- /dev/null
+++ b/scripts/cross-test.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+# Clang/LLVM is natively a cross-compiler.
+# TODO: Do cross-compilation using Clang
+# https://clang.llvm.org/docs/CrossCompilation.html
+if [ $(printenv CXX | grep clang) ]; then
+  exit
+fi
+
+set -x
+
+make clean
+make CROSS_COMPILE=riscv64-unknown-elf- SIMULATOR_TYPE=$1 test || exit 1 # riscv64
diff --git a/sse2rvv.h b/sse2rvv.h
new file mode 100644
index 0000000..23108de
--- /dev/null
+++ b/sse2rvv.h
@@ -0,0 +1,3272 @@
+#ifndef SSE2RVV_H
+#define SSE2RVV_H
+
+// This header file provides a simple API translation layer
+// from SSE intrinsics to their corresponding RVV versions
+
+/*
+ * sse2rvv is freely redistributable under the MIT License. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// /* Tunable configurations */ + +// /* Enable precise implementation of math operations +// * This would slow down the computation a bit, but gives consistent result +// with +// * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) +// */ +// /* _mm_min|max_ps|ss|pd|sd */ +// #ifndef SSE2RVV_PRECISE_MINMAX +// #define SSE2RVV_PRECISE_MINMAX (0) +// #endif +// /* _mm_rcp_ps and _mm_div_ps */ +// #ifndef SSE2RVV_PRECISE_DIV +// #define SSE2RVV_PRECISE_DIV (0) +// #endif +// /* _mm_sqrt_ps and _mm_rsqrt_ps */ +// #ifndef SSE2RVV_PRECISE_SQRT +// #define SSE2RVV_PRECISE_SQRT (0) +// #endif +// /* _mm_dp_pd */ +// #ifndef SSE2RVV_PRECISE_DP +// #define SSE2RVV_PRECISE_DP (0) +// #endif + +/* compiler specific definitions */ +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2rvv_likely(x) __builtin_expect(!!(x), 1) +#define _sse2rvv_unlikely(x) __builtin_expect(!!(x), 0) +#else +#pragma message("Macro name collisions may happen with unsupported compilers.") +#endif + +/* C language does not allow initializing a variable with a function call. */ +#ifdef __cplusplus +#define _sse2rvv_const static const +#else +#define _sse2rvv_const const +#endif + +#include +#include +#include + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. 
+ */ +typedef vint64m1_t __m64; +typedef vfloat32m1_t __m128; /* 128-bit vector containing 4 floats */ +typedef vfloat64m1_t __m128d; /* 128-bit vector containing 2 doubles */ +typedef vint64m1_t __m128i; /* 128-bit vector containing integers */ +typedef vuint8m4_t uint8x16x4_t; + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://learn.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec;
+
+/* SSE macros */
+// #define _MM_GET_FLUSH_ZERO_MODE _sse2rvv_mm_get_flush_zero_mode
+// #define _MM_SET_FLUSH_ZERO_MODE _sse2rvv_mm_set_flush_zero_mode
+// #define _MM_GET_DENORMALS_ZERO_MODE _sse2rvv_mm_get_denormals_zero_mode
+// #define _MM_SET_DENORMALS_ZERO_MODE _sse2rvv_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
+// FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+// FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+// FORCE_INLINE __m128 _mm_set_ps1(float);
+// FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+// FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+// FORCE_INLINE __m128i _mm_castps_si128(__m128);
+// FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+// FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+// FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+// FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+// FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+// FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+// FORCE_INLINE __m128d _mm_set_pd(double, double);
+// FORCE_INLINE __m128i _mm_set1_epi32(int);
+// FORCE_INLINE __m128i _mm_setzero_si128(void);
+// SSE4.1
+// FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+// FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+// FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+// FORCE_INLINE __m128 _mm_floor_ps(__m128);
+// FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+// FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+// FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. 
*/ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ +}; + +// The bit field mapping to the FPCR(floating-point control register) +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t bit24 : 1; + uint8_t res2 : 7; +#if defined(__aarch64__) || defined(_M_ARM64) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +// FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) {} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +// FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) {} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +// FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) {} + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +// #define _mm_shuffle_epi32_default(a, imm) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +// FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) {} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
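+// Illustrative C sketch of these semantics (reference only; the helper name is
+// hypothetical and the eventual RVV implementation may differ):
+//   static inline void shuffle_epi_2301_ref(const int32_t a[4], int32_t dst[4])
+//   {
+//     dst[0] = a[1]; dst[1] = a[0]; /* swap the lower pair */
+//     dst[2] = a[3]; dst[3] = a[2]; /* swap the upper pair */
+//   }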
+// FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) {} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +// FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) {} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +// FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) {} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +// FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) {} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +// FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) {} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +// FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) {} + +// FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) {} + +// FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) {} + +// FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) {} + +#if defined(__aarch64__) || defined(_M_ARM64) +// #define _mm_shuffle_epi32_splat(a, imm) +#else +// #define _mm_shuffle_epi32_splat(a, imm) +#endif + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +// #define _mm_shuffle_ps_default(a, b, imm) + +// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. +// Store the results in the low 64 bits of dst, with the high 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 +// #define _mm_shufflelo_epi16_function(a, imm) + +// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. +// Store the results in the high 64 bits of dst, with the low 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 +// #define _mm_shufflehi_epi16_function(a, imm) + +/* MMX */ + +//_mm_empty is a no-op on arm +// FORCE_INLINE void _mm_empty(void) {} +/* SSE */ + +// Add packed single-precision (32-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps +// FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {} + +// Add the lower single-precision (32-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss +// FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) {} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. 
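+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): the AND operates on the raw bit patterns of the floats.
+//   static inline void and_ps_ref(const float a[4], const float b[4],
+//                                 float dst[4])
+//   {
+//     for (int i = 0; i < 4; i++) {
+//       uint32_t ua, ub, ur;
+//       memcpy(&ua, &a[i], 4);
+//       memcpy(&ub, &b[i], 4);
+//       ur = ua & ub;
+//       memcpy(&dst[i], &ur, 4);
+//     }
+//   }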
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +// FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) {} + +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps +// FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) {} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 +// FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) {} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 +// FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps +// FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss +// FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps +// FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss +// FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps +// FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss +// FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps +// FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. 
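+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): a true comparison yields an all-ones bit pattern in lane 0.
+//   static inline void cmple_ss_ref(const float a[4], const float b[4],
+//                                   float dst[4])
+//   {
+//     uint32_t m = (a[0] <= b[0]) ? 0xFFFFFFFFu : 0u;
+//     memcpy(&dst[0], &m, 4);
+//     dst[1] = a[1]; dst[2] = a[2]; dst[3] = a[3];
+//   }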
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss +// FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps +// FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss +// FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps +// FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss +// FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps +// FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss +// FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps +// FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss +// FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps +// FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss +// FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps +// FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss +// FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +// FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss +// FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps +// FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss +// FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss +// FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss +// FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss +// FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). 
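+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): unlike the cmp* family, this returns a plain int, and an
+// unordered (NaN) comparison yields 0.
+//   static inline int comile_ss_ref(const float a[4], const float b[4])
+//   {
+//     return (a[0] <= b[0]) ? 1 : 0;
+//   }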
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss +// FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss +// FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss +// FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) {} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps +// FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi +// FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) {} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss +// FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) {} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +// FORCE_INLINE int _mm_cvt_ss2si(__m128 a) {} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps +// FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) {} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps +// FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) {} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps +// FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) {} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. 
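+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): only the lower four bytes of the 64-bit input are converted.
+//   static inline void cvtpi8_ps_ref(const int8_t a[8], float dst[4])
+//   {
+//     for (int i = 0; i < 4; i++)
+//       dst[i] = (float)a[i];
+//   }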
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps +// FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 +// FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 +// #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 +// FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) {} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps +// FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) {} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps +// FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) {} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss +// #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss +// FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) {} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +// FORCE_INLINE float _mm_cvtss_f32(__m128 a) {} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 +// #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 +// FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
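+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): the 64-bit result holds the lower two elements, and the C
+// cast truncates toward zero just like the intrinsic.
+//   static inline void cvtt_ps2pi_ref(const float a[4], int32_t dst[2])
+//   {
+//     dst[0] = (int32_t)a[0];
+//     dst[1] = (int32_t)a[1];
+//   }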
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi +// FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) {} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si +// FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32 +// #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32 +// #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64 +// FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) {} + +// Divide packed single-precision (32-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement +// division by multiplying a by b's reciprocal before using the Newton-Raphson +// method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps +// FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) {} + +// Divide the lower single-precision (32-bit) floating-point element in a by the +// lower single-precision (32-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper 3 packed elements from a to +// the upper elements of dst. +// Warning: ARMv7-A does not produce the same result compared to Intel and not +// IEEE-compliant. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss +// FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) {} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 +// #define _mm_extract_pi16(a, imm) + +// Free aligned memory that was allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free +#if !defined(SSE2RVV_ALLOC_DEFINED) +// FORCE_INLINE void _mm_free(void *addr) {} +#endif + +// Macro: Get the flush zero bits from the MXCSR control and status register. +// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or +// _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE +// FORCE_INLINE unsigned int _sse2rvv_mm_get_flush_zero_mode(void) {} + +// Macro: Get the rounding mode bits from the MXCSR control and status register. 
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, +// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE +// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) {} + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16 +// #define _mm_insert_pi16(a, b, imm) + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps +// FORCE_INLINE __m128 _mm_load_ps(const float *p) {} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 +// #define _mm_load_ps1 _mm_load1_ps + +// Load a single-precision (32-bit) floating-point element from memory into the +// lower of dst, and zero the upper 3 elements. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss +// FORCE_INLINE __m128 _mm_load_ss(const float *p) {} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +// FORCE_INLINE __m128 _mm_load1_ps(const float *p) {} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// upper 2 elements of dst, and copy the lower 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi +// FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) {} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi +// FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) {} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps +// FORCE_INLINE __m128 _mm_loadr_ps(const float *p) {} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +// FORCE_INLINE __m128 _mm_loadu_ps(const float *p) {} + +// Load unaligned 16-bit integer from memory into the first element of dst. 
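+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): only 16 bits are read, and the remaining lanes are zeroed.
+//   static inline void loadu_si16_ref(const void *p, uint16_t dst[8])
+//   {
+//     memset(dst, 0, 16);
+//     memcpy(&dst[0], p, 2);
+//   }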
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 +// FORCE_INLINE __m128i _mm_loadu_si16(const void *p) {} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 +// FORCE_INLINE __m128i _mm_loadu_si64(const void *p) {} + +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +#if !defined(SSE2RVV_ALLOC_DEFINED) +// FORCE_INLINE void *_mm_malloc(size_t size, size_t align) {} +#endif + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 +// FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) {} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq +// #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 +// FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps +// FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) {} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 +// FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss +// FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) {} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 +// FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. 
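+// Illustrative C sketch of the x86 semantics (reference only; the helper name
+// is hypothetical): the second operand is returned whenever the comparison is
+// false, which is what makes the NaN and signed-zero behavior non-IEEE.
+//   static inline void min_ps_ref(const float a[4], const float b[4],
+//                                 float dst[4])
+//   {
+//     for (int i = 0; i < 4; i++)
+//       dst[i] = (a[i] < b[i]) ? a[i] : b[i];
+//   }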
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps +// FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) {} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 +// FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss +// FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) {} + +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss +// FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) {} + +// Move the upper 2 single-precision (32-bit) floating-point elements from b to +// the lower 2 elements of dst, and copy the upper 2 elements from a to the +// upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps +// FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) {} + +// Move the lower 2 single-precision (32-bit) floating-point elements from b to +// the upper 2 elements of dst, and copy the lower 2 elements from a to the +// lower 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps +// FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) {} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8 +// FORCE_INLINE int _mm_movemask_pi8(__m64 a) {} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed single-precision (32-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps +// FORCE_INLINE int _mm_movemask_ps(__m128 a) {} + +// Multiply packed single-precision (32-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps +// FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) {} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss +// FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) {} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
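+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical):
+//   static inline void mulhi_pu16_ref(const uint16_t a[4], const uint16_t b[4],
+//                                     uint16_t dst[4])
+//   {
+//     for (int i = 0; i < 4; i++)
+//       dst[i] = (uint16_t)(((uint32_t)a[i] * (uint32_t)b[i]) >> 16);
+//   }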
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 +// FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) {} + +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps +// FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) {} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb +// #define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw +// #define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw +// #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw +// #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw +// #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub +// #define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw +// #define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub +// #define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb +// #define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw +// #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Fetch the line of data from memory that contains address p to a location in +// the cache hierarchy specified by the locality hint i. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch +// FORCE_INLINE void _mm_prefetch(char const *p, int i) {} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. 
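+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): for the 64-bit form there is a single sum, stored in the low
+// 16 bits, with the remaining lanes zeroed.
+//   static inline void sad_pu8_ref(const uint8_t a[8], const uint8_t b[8],
+//                                  uint16_t dst[4])
+//   {
+//     uint16_t sum = 0;
+//     for (int i = 0; i < 8; i++)
+//       sum += (uint16_t)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
+//     dst[0] = sum;
+//     dst[1] = dst[2] = dst[3] = 0;
+//   }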
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw +// #define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw +// #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps +// FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) {} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss +// FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) {} + +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps +// FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) {} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss +// FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) {} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8 +// FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) {} + +// Macro: Set the flush zero bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The flush zero may contain any of the +// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE +// FORCE_INLINE void _sse2rvv_mm_set_flush_zero_mode(unsigned int flag) {} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps +// FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) {} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1 +// FORCE_INLINE __m128 _mm_set_ps1(float _w) {} + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. 
The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE +// FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) {} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss +// FORCE_INLINE __m128 _mm_set_ss(float a) {} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps +// FORCE_INLINE __m128 _mm_set1_ps(float _w) {} + +// Set the MXCSR control and status register with the value in unsigned 32-bit +// integer a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr +// FIXME: _mm_setcsr() implementation supports changing the rounding mode only. +// FORCE_INLINE void _mm_setcsr(unsigned int a) {} + +// Get the unsigned 32-bit value of the MXCSR control and status register. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr +// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. +// FORCE_INLINE unsigned int _mm_getcsr(void) {} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps +// FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) {} + +// Return vector of type __m128 with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps +// FORCE_INLINE __m128 _mm_setzero_ps(void) {} + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +#ifdef _sse2rvv_shuffle +// #define _mm_shuffle_pi16(a, imm) +#else +// #define _mm_shuffle_pi16(a, imm) +#endif + +// Perform a serializing operation on all store-to-memory instructions that were +// issued prior to this instruction. Guarantees that every store instruction +// that precedes, in program order, is globally visible before any store +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence +// FORCE_INLINE void _mm_sfence(void) {} + +// Perform a serializing operation on all load-from-memory and store-to-memory +// instructions that were issued prior to this instruction. Guarantees that +// every memory access that precedes, in program order, the memory fence +// instruction is globally visible before any memory instruction which follows +// the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence +// FORCE_INLINE void _mm_mfence(void) {} + +// Perform a serializing operation on all load-from-memory instructions that +// were issued prior to this instruction. Guarantees that every load instruction +// that precedes, in program order, is globally visible before any load +// instruction which follows the fence in program order. 
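+// On RISC-V, one plausible way to provide this load-ordering guarantee (an
+// assumption, not necessarily what this header will end up using) is:
+//   __asm__ __volatile__("fence r, r" ::: "memory");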
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
+// FORCE_INLINE void _mm_lfence(void) {}
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#ifdef _sse2rvv_shuffle
+// #define _mm_shuffle_ps(a, b, imm)
+#else // generic
+// #define _mm_shuffle_ps(a, b, imm)
+#endif
+
+// Compute the square root of packed single-precision (32-bit) floating-point
+// elements in a, and store the results in dst.
+// Note: unlike the Arm NEON port this project derives from, RVV provides a
+// vector square-root instruction (vfsqrt), so no reciprocal/Newton-Raphson
+// approximation should be needed here.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
+// FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) {}
+
+// Compute the square root of the lower single-precision (32-bit) floating-point
+// element in a, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
+// FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) {}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
+// FORCE_INLINE void _mm_store_ps(float *p, __m128 a) {}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
+// FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) {}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
+// FORCE_INLINE void _mm_store_ss(float *p, __m128 a) {}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
+// #define _mm_store1_ps _mm_store_ps1
+
+// Store the upper 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
+// FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) {}
+
+// Store the lower 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
+// FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) {}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
+// FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) {}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
+// FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) {}
+
+// Stores 16-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
+// FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) {}
+
+// Stores 64-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
+// FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) {}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
+// FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) {}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
+// FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) {}
+
+// Subtract packed single-precision (32-bit) floating-point elements in b from
+// packed single-precision (32-bit) floating-point elements in a, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
+// FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) {}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
+// FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) {}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// Implemented with the unpack/move intrinsics declared in this header so that
+// no Arm NEON types or intrinsics are referenced.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                \
+  do {                                                           \
+    __m128 _tmp0 = _mm_unpacklo_ps((row0), (row1));              \
+    __m128 _tmp2 = _mm_unpacklo_ps((row2), (row3));              \
+    __m128 _tmp1 = _mm_unpackhi_ps((row0), (row1));              \
+    __m128 _tmp3 = _mm_unpackhi_ps((row2), (row3));              \
+    (row0) = _mm_movelh_ps(_tmp0, _tmp2);                        \
+    (row1) = _mm_movehl_ps(_tmp2, _tmp0);                        \
+    (row2) = _mm_movelh_ps(_tmp1, _tmp3);                        \
+    (row3) = _mm_movehl_ps(_tmp3, _tmp1);                        \
+  } while (0)
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions. We'll just alias them here.
+// #define _mm_ucomieq_ss _mm_comieq_ss
+// #define _mm_ucomige_ss _mm_comige_ss
+// #define _mm_ucomigt_ss _mm_comigt_ss
+// #define _mm_ucomile_ss _mm_comile_ss
+// #define _mm_ucomilt_ss _mm_comilt_ss
+// #define _mm_ucomineq_ss _mm_comineq_ss
+
+// Return vector of type __m128i with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 +// FORCE_INLINE __m128i _mm_undefined_si128(void) {} + +// Return vector of type __m128 with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps +// FORCE_INLINE __m128 _mm_undefined_ps(void) {} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps +// FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) {} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps +// FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) {} + +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps +// FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) {} + +/* SSE2 */ + +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 +// FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) {} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +// FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) {} + +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 +// FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) {} + +// Add packed 8-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 +// FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) {} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd +// FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) {} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd +// FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) {} + +// Add 64-bit integers a and b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 +// FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) {} + +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 +// FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) {} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. 
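+// A scalar sketch of the saturating-add semantics above (illustrative only;
+// the i8[] lane notation is informal shorthand):
+//   for (int j = 0; j < 16; j++) {
+//     int16_t s = (int16_t)a.i8[j] + (int16_t)b.i8[j];
+//     dst.i8[j] = (int8_t)(s > 127 ? 127 : (s < -128 ? -128 : s));
+//   }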
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 +// FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) {} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 +// FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) {} + +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 +// FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) {} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd +// FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +// FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd +// FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) {} + +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 +// FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) {} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 +// FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) {} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 +// FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) {} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128 +// #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128 +// #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps +// FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) {} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128 +// FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) {} + +// Cast vector of type __m128 to type __m128d. 
This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd +// FORCE_INLINE __m128d _mm_castps_pd(__m128 a) {} + +// Cast vector of type __m128 to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128 +// FORCE_INLINE __m128i _mm_castps_si128(__m128 a) {} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd +// FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) {} + +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps +// FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) {} + +// Invalidate and flush the cache line that contains p from all levels of the +// cache hierarchy. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush +#if defined(__APPLE__) +#include +#endif +// FORCE_INLINE void _mm_clflush(void const *p) {} + +// Compare packed 16-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16 +// FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) {} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32 +// FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) {} + +// Compare packed 8-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8 +// FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd +// FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd +// FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd +// FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. 
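+// A scalar sketch of the semantics above (illustrative only; the u64[]/f64[]
+// lane notation is informal shorthand, and a NaN operand compares as false):
+//   dst.u64[0] = (a.f64[0] >= b.f64[0]) ? 0xFFFFFFFFFFFFFFFF : 0;
+//   dst.f64[1] = a.f64[1];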
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd +// FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) {} + +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 +// FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) {} + +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 +// FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) {} + +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 +// FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd +// FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd +// FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd +// FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd +// FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) {} + +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 +// FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) {} + +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 +// FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) {} + +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 +// FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd +// FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd +// FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd +// FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd +// FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd +// FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd +// FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd +// FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd +// FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd +// FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd +// FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd +// FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd +// FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd +// FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd +// FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd +// FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd +// FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd +// FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd +// FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd +// FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd +// FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd +// FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd +// FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) {} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd +// FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) {} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +// FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 +// FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 +// FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps +// FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) {} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd +// FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 +// FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd +// FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) {} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 +// FORCE_INLINE double _mm_cvtsd_f64(__m128d a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 +// FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. 
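+// For example (illustrative): with the default MXCSR rounding mode
+// (round-to-nearest-even), 2.5 converts to 2 and 3.5 converts to 4; the
+// truncating variant is _mm_cvttsd_si64.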
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
+// FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) {}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
+// #define _mm_cvtsd_si64x _mm_cvtsd_si64
+
+// Convert the lower double-precision (64-bit) floating-point element in b to a
+// single-precision (32-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
+// FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) {}
+
+// Copy the lower 32-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
+// FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) {}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
+// FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) {}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+// #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Convert the signed 32-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
+// FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) {}
+
+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
+// FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) {}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
+// FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) {}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
+// FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) {}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
+// #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd +// #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd +// FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 +// FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 +// FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +// FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 +// FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 +// FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x +// #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd +// FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) {} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd +// FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) {} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +// #define _mm_extract_epi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. 
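+// Usage sketch (illustrative only):
+//   __m128i v = _mm_set1_epi16(0);
+//   v = _mm_insert_epi16(v, 0x7FFF, 3);  // 16-bit lane 3 becomes 0x7FFF,
+//                                        // all other lanes keep their value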
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+// #define _mm_insert_epi16(a, b, imm)
+
+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
+// FORCE_INLINE __m128d _mm_load_pd(const double *p) {}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
+// #define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
+// FORCE_INLINE __m128d _mm_load_sd(const double *p) {}
+
+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
+// FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) {}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
+// FORCE_INLINE __m128d _mm_load1_pd(const double *p) {}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
+// FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) {}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
+// FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) {}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
+// FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) {}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+// FORCE_INLINE __m128d _mm_loadr_pd(const double *p) {}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into
+// dst. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+// FORCE_INLINE __m128d _mm_loadu_pd(const double *p) {}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
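+// Usage sketch (illustrative only): unlike _mm_load_si128, the pointer does
+// not have to be 16-byte aligned:
+//   unsigned char buf[17];
+//   __m128i v = _mm_loadu_si128((const __m128i *)(buf + 1));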
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 +// FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) {} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32 +// FORCE_INLINE __m128i _mm_loadu_si32(const void *p) {} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Horizontally add adjacent pairs of intermediate +// 32-bit integers, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16 +// FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) {} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. mem_addr does not need to be aligned +// on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128 +// FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char +// *mem_addr) {} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 +// FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) {} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 +// FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd +// FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd +// FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) {} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 +// FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) {} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 +// FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd +// FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd +// FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) {} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 +// FORCE_INLINE __m128i _mm_move_epi64(__m128i a) {} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd +// FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) {} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +// FORCE_INLINE int _mm_movemask_epi8(__m128i a) {} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd +// FORCE_INLINE int _mm_movemask_pd(__m128d a) {} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 +// FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) {} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 +// FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) {} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 +// FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) {} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd +// FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) {} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd +// FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) {} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 +// FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) {} + +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 +// FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) {} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
+// FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) {}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and store the low 16 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
+// FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) {}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
+// FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) {}
+
+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
+// FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) {}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
+// FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) {}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
+// FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) {}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
+// FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) {}
+
+// Pause the processor. This is typically used in spin-wait loops and, depending
+// on the x86 processor, typical values are in the 40-100 cycle range. On
+// RISC-V, the Zihintpause 'pause' hint (where available) or a plain no-op is
+// likely a reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+// FORCE_INLINE void _mm_pause(void) {}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+// FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) {}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+// FORCE_INLINE __m128i _mm_set_epi16(short i7,
+//                                    short i6,
+//                                    short i5,
+//                                    short i4,
+//                                    short i3,
+//                                    short i2,
+//                                    short i1,
+//                                    short i0) {}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+// FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) {}
+
+// Set packed 64-bit integers in dst with the supplied values.
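+// Note on argument order (illustrative): the first argument supplies the upper
+// 64-bit lane, e.g. with the 64-bit-integer variant _mm_set_epi64x below,
+//   __m128i v = _mm_set_epi64x(1, 2);  // lane 0 (low) == 2, lane 1 (high) == 1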
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64 +// FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) {} + +// Set packed 64-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x +// FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) {} + +// Set packed 8-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8 +// FORCE_INLINE __m128i _mm_set_epi8(signed char b15, +// signed char b14, +// signed char b13, +// signed char b12, +// signed char b11, +// signed char b10, +// signed char b9, +// signed char b8, +// signed char b7, +// signed char b6, +// signed char b5, +// signed char b4, +// signed char b3, +// signed char b2, +// signed char b1, +// signed char b0) {} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd +// FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) {} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 +// #define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd +// FORCE_INLINE __m128d _mm_set_sd(double a) {} + +// Broadcast 16-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 +// FORCE_INLINE __m128i _mm_set1_epi16(short w) {} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +// FORCE_INLINE __m128i _mm_set1_epi32(int _i) {} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 +// FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) {} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x +// FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) {} + +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 +// FORCE_INLINE __m128i _mm_set1_epi8(signed char w) {} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd +// FORCE_INLINE __m128d _mm_set1_pd(double d) {} + +// Set packed 16-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 +// FORCE_INLINE __m128i _mm_setr_epi16(short w0, +// short w1, +// short w2, +// short w3, +// short w4, +// short w5, +// short w6, +// short w7) {} + +// Set packed 32-bit integers in dst with the supplied values in reverse order. 
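+// The set1 family above is a plain splat. As a hedged sketch (illustrative
+// helper name, fixed vl of 4, VLEN >= 128 assumed), a set1_epi32-style
+// broadcast could be a single vmv.v.x:
+static inline void sse2rvv_sketch_set1_epi32(int value, int32_t out[4])
+{
+  size_t vl = 4; /* a 128-bit vector holds 4 x int32 */
+  vint32m1_t v = __riscv_vmv_v_x_i32m1(value, vl); /* broadcast the scalar */
+  __riscv_vse32_v_i32m1(out, v, vl);
+}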
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 +// FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) {} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 +// FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) {} + +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 +// FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, +// signed char b1, +// signed char b2, +// signed char b3, +// signed char b4, +// signed char b5, +// signed char b6, +// signed char b7, +// signed char b8, +// signed char b9, +// signed char b10, +// signed char b11, +// signed char b12, +// signed char b13, +// signed char b14, +// signed char b15) {} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd +// FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) {} + +// Return vector of type __m128d with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd +// FORCE_INLINE __m128d _mm_setzero_pd(void) {} + +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 +// FORCE_INLINE __m128i _mm_setzero_si128(void) {} + +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2rvv_shuffle) +// #define _mm_shuffle_epi32(a, imm) +#else // generic +// #define _mm_shuffle_epi32(a, imm) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +#ifdef _sse2rvv_shuffle +// #define _mm_shuffle_pd(a, b, imm8) +#else +// #define _mm_shuffle_pd(a, b, imm8) +#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2rvv_shuffle) +// #define _mm_shufflehi_epi16(a, imm) +#else // generic +// #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2rvv_shuffle) +// #define _mm_shufflelo_epi16(a, imm) +#else // generic +// #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 +// FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) {} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. 
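+// Immediate shuffles such as _mm_shuffle_epi32 above usually become a
+// register gather on RVV. A hedged sketch (illustrative helper, plain arrays,
+// vl fixed to 4): build the lane indices from imm8 and let vrgather do the
+// permutation.
+static inline void sse2rvv_sketch_shuffle_epi32(const int32_t *a, int imm,
+                                                int32_t *out)
+{
+  size_t vl = 4;
+  uint32_t idx[4] = {(uint32_t)(imm & 3), (uint32_t)((imm >> 2) & 3),
+                     (uint32_t)((imm >> 4) & 3), (uint32_t)((imm >> 6) & 3)};
+  vint32m1_t va = __riscv_vle32_v_i32m1(a, vl);
+  vuint32m1_t vidx = __riscv_vle32_v_u32m1(idx, vl);
+  /* output lane i takes source lane idx[i] */
+  __riscv_vse32_v_i32m1(out, __riscv_vrgather_vv_i32m1(va, vidx, vl), vl);
+}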
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 +// FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) {} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 +// FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) {} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 +// FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) {} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 +// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) {} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 +// FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) {} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 +// #define _mm_slli_si128(a, imm) + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd +// FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) {} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd +// FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) {} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 +// FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) {} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 +// FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) {} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 +// FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) {} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +// #define _mm_srai_epi32(a, imm) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. 
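+// One pitfall with the shift stubs above: SSE defines counts larger than the
+// element width to produce zero, while RVV's vsll/vsrl/vsra only use the low
+// log2(SEW) bits of the count, so a guard is needed. A hedged sketch for an
+// slli_epi16-style shift (illustrative helper, vl fixed to 8):
+static inline void sse2rvv_sketch_slli_epi16(const int16_t *a, int imm,
+                                             int16_t *out)
+{
+  size_t vl = 8;
+  vint16m1_t va = __riscv_vle16_v_i16m1(a, vl);
+  vint16m1_t r;
+  if (imm < 0 || imm > 15)
+    r = __riscv_vmv_v_x_i16m1(0, vl); /* out-of-range count yields zero */
+  else
+    r = __riscv_vsll_vx_i16m1(va, (size_t)imm, vl);
+  __riscv_vse16_v_i16m1(out, r, vl);
+}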
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 +// FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) {} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 +// FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) {} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 +// FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) {} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 +// #define _mm_srli_epi16(a, imm) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +// #define _mm_srli_epi32(a, imm) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +// #define _mm_srli_epi64(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 +// #define _mm_srli_si128(a, imm) + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd +// FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) {} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 +// FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) {} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd +// FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) {} + +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 +// FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) {} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
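+// The byte shifts (_mm_slli_si128/_mm_srli_si128) above have a VLEN subtlety:
+// vslidedown is the natural tool, but it only fills with zeros past VLMAX, so
+// on machines wider than 128 bits the vacated bytes must be zeroed
+// explicitly. A plain-C reference of the srli_si128 semantics (illustrative
+// helper, arrays stand in for __m128i):
+static inline void sse2rvv_ref_srli_si128(const uint8_t a[16], int imm,
+                                          uint8_t dst[16])
+{
+  for (int i = 0; i < 16; i++)
+    dst[i] = (imm >= 0 && i + imm < 16) ? a[i + imm] : 0;
+}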
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd +// #define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd +// FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) {} + +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 +// FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) {} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd +// FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) {} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd +// FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) {} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd +// FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) {} + +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 +// FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) {} + +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 +// FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) {} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd +// FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) {} + +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 +// FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) {} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 +// FORCE_INLINE void _mm_stream_si32(int *p, int a) {} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. 
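+// Several of the store stubs above only touch one lane, and the _mm_stream_*
+// hints have no direct RVV counterpart (a simple port can treat them as
+// ordinary stores). A hedged sketch of a storeh_pd-style store (illustrative
+// helper, arrays stand in for __m128d, vl fixed to 2): slide the upper lane
+// down and extract it as a scalar.
+static inline void sse2rvv_sketch_storeh_pd(double *mem_addr, const double *a)
+{
+  size_t vl = 2;
+  vfloat64m1_t va = __riscv_vle64_v_f64m1(a, vl);
+  vfloat64m1_t hi = __riscv_vslidedown_vx_f64m1(va, 1, vl);
+  *mem_addr = __riscv_vfmv_f_s_f64m1_f64(hi); /* scalar read of lane 0 */
+}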
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 +// FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) {} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 +// FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) {} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +// FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) {} + +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 +// FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) {} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 +// FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) {} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd +// FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) {} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd +// FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) {} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 +// FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) {} + +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 +// FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) {} + +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 +// FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) {} + +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 +// FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) {} + +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. 
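+// The saturating subtracts above map directly onto RVV's vssub/vssubu
+// instructions. A hedged sketch for subs_epu16 (illustrative helper, vl fixed
+// to 8):
+static inline void sse2rvv_sketch_subs_epu16(const uint16_t *a,
+                                             const uint16_t *b, uint16_t *out)
+{
+  size_t vl = 8;
+  vuint16m1_t va = __riscv_vle16_v_u16m1(a, vl);
+  vuint16m1_t vb = __riscv_vle16_v_u16m1(b, vl);
+  /* vssubu clamps at zero, matching unsigned saturation */
+  __riscv_vse16_v_u16m1(out, __riscv_vssubu_vv_u16m1(va, vb, vl), vl);
+}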
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 +// FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) {} + +// #define _mm_ucomieq_sd _mm_comieq_sd +// #define _mm_ucomige_sd _mm_comige_sd +// #define _mm_ucomigt_sd _mm_comigt_sd +// #define _mm_ucomile_sd _mm_comile_sd +// #define _mm_ucomilt_sd _mm_comilt_sd +// #define _mm_ucomineq_sd _mm_comineq_sd + +// Return vector of type __m128d with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd +// FORCE_INLINE __m128d _mm_undefined_pd(void) {} + +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 +// FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) {} + +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 +// FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) {} + +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 +// FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) {} + +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 +// FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) {} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd +// FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) {} + +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 +// FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) {} + +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 +// FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) {} + +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 +// FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) {} + +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 +// FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) {} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. 
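+// The unpack/interleave stubs above have no single RVV instruction; they are
+// typically built from an index vector plus vrgather or from segment loads.
+// The intended lane order is easiest to state in plain C, e.g. for
+// unpacklo_epi32 (illustrative helper, arrays stand in for __m128i):
+static inline void sse2rvv_ref_unpacklo_epi32(const int32_t a[4],
+                                              const int32_t b[4],
+                                              int32_t dst[4])
+{
+  dst[0] = a[0]; /* low halves of a and b, interleaved a-first */
+  dst[1] = b[0];
+  dst[2] = a[1];
+  dst[3] = b[1];
+}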
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd +// FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) {} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd +// FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) {} + +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 +// FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) {} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd +// FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) {} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps +// FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) {} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd +// FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) {} + +// Horizontally add adjacent pairs of single-precision (32-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps +// FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) {} + +// Horizontally subtract adjacent pairs of double-precision (64-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd +// FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) {} + +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps +// FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) {} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 +// #define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd +// #define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd +// FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) {} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. 
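+// The SSE3 addsub/hadd/hsub stubs above all follow a fixed even/odd lane
+// pattern. A plain-C reference of addsub_ps (illustrative helper; even lanes
+// subtract, odd lanes add):
+static inline void sse2rvv_ref_addsub_ps(const float a[4], const float b[4],
+                                         float dst[4])
+{
+  for (int i = 0; i < 4; i++)
+    dst[i] = (i & 1) ? a[i] + b[i] : a[i] - b[i];
+}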
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps +// FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) {} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps +// FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) {} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 +// FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) {} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 +// FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) {} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 +// FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) {} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 +// FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) {} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 +// FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) {} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 +// FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) {} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +#if defined(__GNUC__) && !defined(__clang__) +// #define _mm_alignr_epi8(a, b, imm) + +#else +// #define _mm_alignr_epi8(a, b, imm) + +#endif + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +// #define _mm_alignr_pi8(a, b, imm) + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 +// FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) {} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 +// FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) {} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. 
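+// RVV has no integer absolute-value instruction, so the _mm_abs_* stubs above
+// are usually max(v, -v). A hedged sketch for abs_epi16 (illustrative helper,
+// vl fixed to 8); note that INT16_MIN stays INT16_MIN, as PABSW specifies:
+static inline void sse2rvv_sketch_abs_epi16(const int16_t *a, int16_t *out)
+{
+  size_t vl = 8;
+  vint16m1_t v = __riscv_vle16_v_i16m1(a, vl);
+  vint16m1_t neg = __riscv_vrsub_vx_i16m1(v, 0, vl); /* 0 - v */
+  __riscv_vse16_v_i16m1(out, __riscv_vmax_vv_i16m1(v, neg, vl), vl);
+}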
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 +// FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) {} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 +// FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) {} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 +// FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) {} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 +// FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) {} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 +// FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) {} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 +// FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) {} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 +// FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) {} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 +// FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) {} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 +// FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) {} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 +// FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) {} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +// FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) {} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. 
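+// The horizontal add/subtract stubs above pack all pairs from a into the low
+// half of dst and all pairs from b into the high half. A plain-C reference of
+// hadd_epi16 (illustrative helper, arrays stand in for __m128i):
+static inline void sse2rvv_ref_hadd_epi16(const int16_t a[8],
+                                          const int16_t b[8], int16_t dst[8])
+{
+  for (int i = 0; i < 4; i++)
+  {
+    dst[i] = (int16_t)(a[2 * i] + a[2 * i + 1]);
+    dst[i + 4] = (int16_t)(b[2 * i] + b[2 * i + 1]);
+  }
+}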
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 +// FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) {} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +// FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) {} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 +// FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) {} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 +// FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8 +// FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) {} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16 +// FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) {} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32 +// FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) {} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8 +// FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) {} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16 +// FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) {} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. 
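+// _mm_shuffle_epi8 above is mostly a vrgather, but the zeroing rule for
+// control bytes with bit 7 set has to be handled explicitly. A plain-C
+// reference of the semantics that any vectorized version has to match
+// (illustrative helper, arrays stand in for __m128i):
+static inline void sse2rvv_ref_shuffle_epi8(const uint8_t a[16],
+                                            const uint8_t b[16],
+                                            uint8_t dst[16])
+{
+  for (int i = 0; i < 16; i++)
+    dst[i] = (b[i] & 0x80) ? 0 : a[b[i] & 0x0F];
+}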
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32 +// FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) {} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8 +// FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) {} + +/* SSE4.1 */ + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +// #define _mm_blend_epi16(a, b, imm) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd +// #define _mm_blend_pd(a, b, imm) + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps +// FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) {} + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 +// FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +// {} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd +// FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) {} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps +// FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) {} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd +// FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) {} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps +// FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) {} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. 
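+// The blendv stubs above select on the top bit of each mask element, which on
+// RVV typically becomes a compare-below-zero mask feeding a merge. A plain-C
+// reference for blendv_epi8 (illustrative helper, arrays stand in for
+// __m128i):
+static inline void sse2rvv_ref_blendv_epi8(const uint8_t a[16],
+                                           const uint8_t b[16],
+                                           const uint8_t mask[16],
+                                           uint8_t dst[16])
+{
+  for (int i = 0; i < 16; i++)
+    dst[i] = (mask[i] & 0x80) ? b[i] : a[i];
+}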
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd +// FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) {} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss +// FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) {} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +// FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) {} + +// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32 +// FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) {} + +// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64 +// FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) {} + +// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64 +// FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) {} + +// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16 +// FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) {} + +// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 +// FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) {} + +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 +// FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) {} + +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 +// FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) {} + +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 +// FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) {} + +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 +// FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) {} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. 
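+// The cvtepi*/cvtepu* stubs above widen only the low lanes of the source. On
+// RVV this is a vsext/vzext with some LMUL bookkeeping; the semantics in
+// plain C, for cvtepi16_epi32 (illustrative helper):
+static inline void sse2rvv_ref_cvtepi16_epi32(const int16_t a[8],
+                                              int32_t dst[4])
+{
+  for (int i = 0; i < 4; i++)
+    dst[i] = (int32_t)a[i]; /* only the low four 16-bit lanes are widened */
+}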
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 +// FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) {} + +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 +// FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) {} + +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 +// FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) {} + +// Conditionally multiply the packed double-precision (64-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, and +// conditionally store the sum in dst using the low 4 bits of imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd +// FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) {} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps +// FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) {} + +// Extract a 32-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +// #define _mm_extract_epi32(a, imm) + +// Extract a 64-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +// #define _mm_extract_epi64(a, imm) + +// Extract an 8-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a, +// __constrange(0,16) int imm) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 +// #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), +// (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +// #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), +// (imm)) + +// Round the packed double-precision (64-bit) floating-point elements in a down +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd +// FORCE_INLINE __m128d _mm_floor_pd(__m128d a) {} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. 
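+// The dot-product stubs above use imm8 twice: the high nibble chooses which
+// products enter the sum and the low nibble chooses which output lanes
+// receive it. A plain-C reference of dp_ps (illustrative helper):
+static inline void sse2rvv_ref_dp_ps(const float a[4], const float b[4],
+                                     int imm8, float dst[4])
+{
+  float sum = 0.0f;
+  for (int i = 0; i < 4; i++)
+    if (imm8 & (0x10 << i))
+      sum += a[i] * b[i];
+  for (int i = 0; i < 4; i++)
+    dst[i] = (imm8 & (1 << i)) ? sum : 0.0f;
+}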
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps +// FORCE_INLINE __m128 _mm_floor_ps(__m128 a) {} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd +// FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) {} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss +// FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) {} + +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +// #define _mm_insert_epi32(a, b, imm) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +// #define _mm_insert_epi64(a, b, imm) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +// #define _mm_insert_epi8(a, b, imm) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +// #define _mm_insert_ps(a, b, imm8) + +// Compare packed signed 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 +// FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) {} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8 +// FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) {} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16 +// FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) {} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +// FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) {} + +// Compare packed signed 32-bit integers in a and b, and store packed minimum +// values in dst. 
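+// The packed min/max stubs above map one-to-one onto vmin/vminu/vmax/vmaxu.
+// A hedged sketch for max_epu16 (illustrative helper, vl fixed to 8):
+static inline void sse2rvv_sketch_max_epu16(const uint16_t *a,
+                                            const uint16_t *b, uint16_t *out)
+{
+  size_t vl = 8;
+  vuint16m1_t va = __riscv_vle16_v_u16m1(a, vl);
+  vuint16m1_t vb = __riscv_vle16_v_u16m1(b, vl);
+  __riscv_vse16_v_u16m1(out, __riscv_vmaxu_vv_u16m1(va, vb, vl), vl);
+}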
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32 +// FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) {} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8 +// FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) {} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16 +// FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) {} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +// FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) {} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16 +// FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) {} + +// Compute the sum of absolute differences (SADs) of quadruplets of unsigned +// 8-bit integers in a compared to those in b, and store the 16-bit results in +// dst. Eight SADs are performed using one quadruplet from b and eight +// quadruplets from a. One quadruplet is selected from b starting at on the +// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit +// integers selected from a starting at the offset specified in imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 +// FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) {} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 +// FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) {} + +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 +// FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) {} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 +// FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) {} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd +// FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) {} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
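+// _mm_minpos_epu16 above returns both the minimum and its first index, with
+// every other bit of dst cleared. A plain-C reference (illustrative helper,
+// arrays stand in for __m128i):
+static inline void sse2rvv_ref_minpos_epu16(const uint16_t a[8],
+                                            uint16_t dst[8])
+{
+  uint16_t min = a[0], idx = 0;
+  for (uint16_t i = 1; i < 8; i++)
+    if (a[i] < min)
+    {
+      min = a[i];
+      idx = i;
+    }
+  for (int i = 0; i < 8; i++)
+    dst[i] = 0;
+  dst[0] = min; /* bits [15:0]  : the minimum value   */
+  dst[1] = idx; /* bits [18:16] : index of that value */
+}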
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +// FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) {} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd +// FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) {} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss +// FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) {} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 +// FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) {} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones +// FORCE_INLINE int _mm_test_all_ones(__m128i a) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros +// FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. 
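+// The test_* stubs above reduce a 128-bit AND to a single flag. A hedged
+// sketch for test_all_zeros (illustrative helper, vl fixed to 2 x int64):
+// AND the operands, compare every lane against zero, and count the hits.
+static inline int sse2rvv_sketch_test_all_zeros(const int64_t *a,
+                                                const int64_t *mask)
+{
+  size_t vl = 2;
+  vint64m1_t v = __riscv_vand_vv_i64m1(__riscv_vle64_v_i64m1(a, vl),
+                                       __riscv_vle64_v_i64m1(mask, vl), vl);
+  vbool64_t zero = __riscv_vmseq_vx_i64m1_b64(v, 0, vl);
+  return __riscv_vcpop_m_b64(zero, vl) == vl; /* 1 iff all lanes ANDed to 0 */
+}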
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 +// FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 +// #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 +// FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) {} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and returns 1 if b did not contain a null character and the +// resulting mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra +// FORCE_INLINE int _mm_cmpestra(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc +// FORCE_INLINE int _mm_cmpestrc(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri +// FORCE_INLINE int _mm_cmpestri(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm +// FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro +// FORCE_INLINE int _mm_cmpestro(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs +// FORCE_INLINE int _mm_cmpestrs(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. 
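+// (Note for the explicit-length cmpestr* family as a whole: elements of a at
+// indices >= la and elements of b at indices >= lb are treated as invalid
+// characters by the comparison, per the Intel guide's operation pseudocode.)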
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz +// FORCE_INLINE int _mm_cmpestrz(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if b did not contain a null character and the resulting +// mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra +// FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc +// FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri +// FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm +// FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro +// FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs +// FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz +// FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) {} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +// FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) {} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 +// FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) {} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32 +// FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) {} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v, and stores the result in dst. 
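+// (The _mm_crc32_* family uses the CRC-32C (Castagnoli) polynomial; the
+// bit-by-bit reference canonical_crc32_u8() in tests/impl.cpp uses its
+// reflected constant 0x82f63b78.)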
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+// FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) {}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+// FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) {}
+
+/* AES */
+
+// Perform one round of an AES encryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// In the absence of an AES extension, aesenc can be implemented with plain
+// vector intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// and https://www.workofard.com/2017/07/ghash-for-low-end-cores/
+// for more information.
+// FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) {}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+// FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) {}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+// FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) {}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+// FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) {}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+// FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) {}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+//
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+// FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) {}
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
+// FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int
+// imm) {}
+
+// FORCE_INLINE unsigned int _sse2rvv_mm_get_denormals_zero_mode(void) {}
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
+// FORCE_INLINE int _mm_popcnt_u32(unsigned int a) {}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
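+// (If the Zbb extension is available, these two intrinsics map naturally onto
+// the RISC-V cpop/cpopw instructions; GCC's and Clang's __builtin_popcountll
+// is a portable fallback. This is only a suggested mapping.)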
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
+// FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) {}
+
+// FORCE_INLINE void _sse2rvv_mm_set_denormals_zero_mode(unsigned int flag) {}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+// FORCE_INLINE uint64_t _rdtsc(void) {}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+// #if defined(__GNUC__) && !defined(__clang__)
+// #pragma GCC pop_options
+// #endif
+
+#endif
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..7f66634
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,29 @@
+# Test Suite for SSE2RVV
+
+:warning: **Warning: The test suite is based on the little-endian architecture.**
+
+## Add More Test Items
+Once the conversion is implemented, the test can be added with the following steps:
+
+* File `tests/impl.h`
+
+  Add the intrinsic under the `INTRIN_LIST` macro. The naming convention
+  should be `mm_xxx`.
+  Place it in the correct classification, in alphabetical order.
+  The classification can be referenced from the [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html).
+
+* File `tests/impl.cpp`
+  ```c
+  result_t test_mm_xxx()
+  {
+    // The C implementation
+    ...
+
+    // The RVV implementation
+    ret = _mm_xxx();
+
+    // Compare the results of the two implementations and return either
+    // TEST_SUCCESS, TEST_FAIL, or TEST_UNIMPL
+    ...
+  }
+  ```
diff --git a/tests/binding.cpp b/tests/binding.cpp
new file mode 100644
index 0000000..6f9b6a3
--- /dev/null
+++ b/tests/binding.cpp
@@ -0,0 +1,31 @@
+#include "binding.h"
+
+#include
+#include
+
+namespace SSE2RVV {
+void *platform_aligned_alloc(size_t size) {
+  void *address;
+#if defined(_WIN32)
+  address = _aligned_malloc(size, 16);
+#else
+  // FIXME
+  // int ret = posix_memalign(&address, 16, size);
+  address = malloc(size);
+#endif
+  if (!address) {
+    fprintf(stderr, "Error at File %s line number %d\n", __FILE__, __LINE__);
+    exit(EXIT_FAILURE);
+  }
+  return address;
+}
+
+void platform_aligned_free(void *ptr) {
+#if defined(_WIN32)
+  _aligned_free(ptr);
+#else
+  free(ptr);
+#endif
+}
+
+} // namespace SSE2RVV
diff --git a/tests/binding.h b/tests/binding.h
new file mode 100644
index 0000000..564b502
--- /dev/null
+++ b/tests/binding.h
@@ -0,0 +1,18 @@
+#ifndef SSE2RVV_BINDING_H
+#define SSE2RVV_BINDING_H
+
+#include
+
+// The SSE2RVV unit tests run both within our own internal project
+// and within the open source framework.
+// This header file is used to abstract any distinctions between
+// those two build environments.
+// +// Initially, this is for how 16 byte aligned memory is allocated +namespace SSE2RVV { +void *platform_aligned_alloc(size_t size); +void platform_aligned_free(void *ptr); + +} // namespace SSE2RVV + +#endif diff --git a/tests/common.cpp b/tests/common.cpp new file mode 100644 index 0000000..33338c0 --- /dev/null +++ b/tests/common.cpp @@ -0,0 +1,330 @@ +#include "common.h" +#include +#include + +namespace SSE2RVV { +int32_t NaN = ~0; +int64_t NaN64 = ~0; + +result_t validate_int64(__m128i a, int64_t i0, int64_t i1) { + const int64_t *t = (const int64_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + return TEST_SUCCESS; +} + +result_t validate_uint64(__m128i a, uint64_t i0, uint64_t i1) { + const uint64_t *t = (const uint64_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + return TEST_SUCCESS; +} + +result_t validate_int64(__m64 a, int64_t i0) { + const int64_t *t = (const int64_t *)&a; + ASSERT_RETURN(t[0] == i0); + return TEST_SUCCESS; +} + +result_t validate_uint64(__m64 a, uint64_t i0) { + const uint64_t *t = (const uint64_t *)&a; + ASSERT_RETURN(t[0] == i0); + return TEST_SUCCESS; +} + +result_t validate_int32(__m128i a, int32_t i0, int32_t i1, int32_t i2, + int32_t i3) { + const int32_t *t = (const int32_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + return TEST_SUCCESS; +} + +result_t validate_uint32(__m128i a, uint32_t u0, uint32_t u1, uint32_t u2, + uint32_t u3) { + const uint32_t *t = (const uint32_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + return TEST_SUCCESS; +} + +result_t validate_int32(__m64 a, int32_t u0, int32_t u1) { + const int32_t *t = (const int32_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + return TEST_SUCCESS; +} + +result_t validate_uint32(__m64 a, uint32_t u0, uint32_t u1) { + const uint32_t *t = (const uint32_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + return TEST_SUCCESS; +} + +result_t validate_int16(__m128i a, int16_t i0, int16_t i1, int16_t i2, + int16_t i3, int16_t i4, int16_t i5, int16_t i6, + int16_t i7) { + const int16_t *t = (const int16_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + ASSERT_RETURN(t[4] == i4); + ASSERT_RETURN(t[5] == i5); + ASSERT_RETURN(t[6] == i6); + ASSERT_RETURN(t[7] == i7); + return TEST_SUCCESS; +} + +result_t validate_uint16(__m128i a, uint16_t u0, uint16_t u1, uint16_t u2, + uint16_t u3, uint16_t u4, uint16_t u5, uint16_t u6, + uint16_t u7) { + const uint16_t *t = (const uint16_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + ASSERT_RETURN(t[4] == u4); + ASSERT_RETURN(t[5] == u5); + ASSERT_RETURN(t[6] == u6); + ASSERT_RETURN(t[7] == u7); + return TEST_SUCCESS; +} + +result_t validate_int16(__m64 a, int16_t i0, int16_t i1, int16_t i2, + int16_t i3) { + const int16_t *t = (const int16_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + return TEST_SUCCESS; +} + +result_t validate_uint16(__m64 a, uint16_t u0, uint16_t u1, uint16_t u2, + uint16_t u3) { + const uint16_t *t = (const uint16_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + return TEST_SUCCESS; +} + +result_t 
validate_int8(__m128i a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, + int8_t i4, int8_t i5, int8_t i6, int8_t i7, int8_t i8, + int8_t i9, int8_t i10, int8_t i11, int8_t i12, + int8_t i13, int8_t i14, int8_t i15) { + const int8_t *t = (const int8_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + ASSERT_RETURN(t[4] == i4); + ASSERT_RETURN(t[5] == i5); + ASSERT_RETURN(t[6] == i6); + ASSERT_RETURN(t[7] == i7); + ASSERT_RETURN(t[8] == i8); + ASSERT_RETURN(t[9] == i9); + ASSERT_RETURN(t[10] == i10); + ASSERT_RETURN(t[11] == i11); + ASSERT_RETURN(t[12] == i12); + ASSERT_RETURN(t[13] == i13); + ASSERT_RETURN(t[14] == i14); + ASSERT_RETURN(t[15] == i15); + return TEST_SUCCESS; +} + +result_t validate_uint8(__m128i a, uint8_t u0, uint8_t u1, uint8_t u2, + uint8_t u3, uint8_t u4, uint8_t u5, uint8_t u6, + uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, + uint8_t u11, uint8_t u12, uint8_t u13, uint8_t u14, + uint8_t u15) { + const uint8_t *t = (const uint8_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + ASSERT_RETURN(t[4] == u4); + ASSERT_RETURN(t[5] == u5); + ASSERT_RETURN(t[6] == u6); + ASSERT_RETURN(t[7] == u7); + ASSERT_RETURN(t[8] == u8); + ASSERT_RETURN(t[9] == u9); + ASSERT_RETURN(t[10] == u10); + ASSERT_RETURN(t[11] == u11); + ASSERT_RETURN(t[12] == u12); + ASSERT_RETURN(t[13] == u13); + ASSERT_RETURN(t[14] == u14); + ASSERT_RETURN(t[15] == u15); + return TEST_SUCCESS; +} + +result_t validate_int8(__m64 a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, + int8_t i4, int8_t i5, int8_t i6, int8_t i7) { + const int8_t *t = (const int8_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + ASSERT_RETURN(t[4] == i4); + ASSERT_RETURN(t[5] == i5); + ASSERT_RETURN(t[6] == i6); + ASSERT_RETURN(t[7] == i7); + return TEST_SUCCESS; +} + +result_t validate_uint8(__m64 a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, + uint8_t u4, uint8_t u5, uint8_t u6, uint8_t u7) { + const uint8_t *t = (const uint8_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + ASSERT_RETURN(t[4] == u4); + ASSERT_RETURN(t[5] == u5); + ASSERT_RETURN(t[6] == u6); + ASSERT_RETURN(t[7] == u7); + return TEST_SUCCESS; +} + +result_t validate_float_pair(float a, float b) { + const uint32_t *ua = (const uint32_t *)&a; + const uint32_t *ub = (const uint32_t *)&b; + // We do an integer (binary) compare rather than a + // floating point compare to take NaNs and infinities + // into account as well. + return (*ua) == (*ub) ? TEST_SUCCESS : TEST_FAIL; +} + +result_t validate_double_pair(double a, double b) { + const uint64_t *ua = (const uint64_t *)&a; + const uint64_t *ub = (const uint64_t *)&b; + // We do an integer (binary) compare rather than a + // floating point compare to take NaNs and infinities + // into account as well. + + if (std::isnan(a) && std::isnan(b)) { + return TEST_SUCCESS; + } + + return (*ua) == (*ub) ? 
TEST_SUCCESS : TEST_FAIL; +} + +result_t validate_float(__m64 a, float f0, float f1) { + const float *t = (const float *)&a; + ASSERT_RETURN(validate_float_pair(t[0], f0)); + ASSERT_RETURN(validate_float_pair(t[1], f1)); + return TEST_SUCCESS; +} + +result_t validate_float(__m128 a, float f0, float f1, float f2, float f3) { + const float *t = (const float *)&a; + ASSERT_RETURN(validate_float_pair(t[0], f0)); + ASSERT_RETURN(validate_float_pair(t[1], f1)); + ASSERT_RETURN(validate_float_pair(t[2], f2)); + ASSERT_RETURN(validate_float_pair(t[3], f3)); + return TEST_SUCCESS; +} + +result_t validate_double(__m128d a, double d0, double d1) { + const double *t = (const double *)&a; + ASSERT_RETURN(validate_double_pair(t[0], d0)); + ASSERT_RETURN(validate_double_pair(t[1], d1)); + return TEST_SUCCESS; +} + +result_t validate_float_epsilon(__m128 a, float f0, float f1, float f2, + float f3, float epsilon) { + const float *t = (const float *)&a; + float df0 = fabsf(t[0] - f0); + float df1 = fabsf(t[1] - f1); + float df2 = fabsf(t[2] - f2); + float df3 = fabsf(t[3] - f3); + + // Due to floating-point error, subtracting floating-point number with NaN + // and zero value usually produces erroneous result. Therefore, we directly + // define the difference of two floating-point numbers to zero if both + // numbers are NaN or zero. + if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0)) { + df0 = 0; + } + + if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0)) { + df1 = 0; + } + + if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0)) { + df2 = 0; + } + + if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0)) { + df3 = 0; + } + + ASSERT_RETURN(df0 < epsilon); + ASSERT_RETURN(df1 < epsilon); + ASSERT_RETURN(df2 < epsilon); + ASSERT_RETURN(df3 < epsilon); + return TEST_SUCCESS; +} + +result_t validate_float_error(__m128 a, float f0, float f1, float f2, float f3, + float err) { + const float *t = (const float *)&a; + float df0 = fabsf((t[0] - f0) / f0); + float df1 = fabsf((t[1] - f1) / f1); + float df2 = fabsf((t[2] - f2) / f2); + float df3 = fabsf((t[3] - f3) / f3); + + if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0) || + (std::isinf(t[0]) && std::isinf(f0))) { + df0 = 0; + } + + if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0) || + (std::isinf(t[1]) && std::isinf(f1))) { + df1 = 0; + } + + if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0) || + (std::isinf(t[2]) && std::isinf(f2))) { + df2 = 0; + } + + if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0) || + (std::isinf(t[3]) && std::isinf(f3))) { + df3 = 0; + } + + ASSERT_RETURN(df0 < err); + ASSERT_RETURN(df1 < err); + ASSERT_RETURN(df2 < err); + ASSERT_RETURN(df3 < err); + return TEST_SUCCESS; +} + +result_t validate_double_error(__m128d a, double d0, double d1, double err) { + const double *t = (const double *)&a; + double td0 = fabs((t[0] - d0) / d0); + double td1 = fabs((t[1] - d1) / d1); + + if (std::isnan(t[0]) && std::isnan(d0)) { + td0 = 0; + } + + if (std::isnan(t[1]) && std::isnan(d1)) { + td1 = 0; + } + + ASSERT_RETURN(td0 < err); + ASSERT_RETURN(td1 < err); + return TEST_SUCCESS; +} + +} // namespace SSE2RVV diff --git a/tests/common.h b/tests/common.h new file mode 100644 index 0000000..efb2357 --- /dev/null +++ b/tests/common.h @@ -0,0 +1,421 @@ +#ifndef SSE2RVV_COMMON_H +#define SSE2RVV_COMMON_H +#include +#if defined(__riscv) || defined(__riscv__) +#include "sse2rvv.h" +#elif defined(__x86_64__) || defined(__i386__) 
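+// On x86 hosts the native Intel intrinsic headers are included instead, so the
+// same test bodies can be compiled and checked against genuine SSE behaviour.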
+#include +#include +#include +#include +#include +#include + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("ALIGN_STRUCT") +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#else +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif + +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#endif + +/* Tunable testing configuration for precise testing */ +/* _mm_min|max_ps|ss|pd|sd */ +#ifndef SSE2RVV_PRECISE_MINMAX +#define SSE2RVV_PRECISE_MINMAX (0) +#endif +#endif + +#define ASSERT_RETURN(x) \ + if (!(x)) \ + return TEST_FAIL; + +namespace SSE2RVV { +enum result_t { + TEST_SUCCESS = 1, + TEST_FAIL = 0, + TEST_UNIMPL = -1, +}; +extern int32_t NaN; +extern int64_t NaN64; +#define ALL_BIT_1_32 (*(float *)&NaN) +#define ALL_BIT_1_64 (*(double *)&NaN64) + +template result_t validate_128bits(T a, T b) { + const int32_t *t1 = (const int32_t *)&a; + const int32_t *t2 = (const int32_t *)&b; + + ASSERT_RETURN(t1[0] == t2[0]); + ASSERT_RETURN(t1[1] == t2[1]); + ASSERT_RETURN(t1[2] == t2[2]); + ASSERT_RETURN(t1[3] == t2[3]); + return TEST_SUCCESS; +} +result_t validate_int64(__m128i a, int64_t i0, int64_t i1); +result_t validate_uint64(__m128i a, uint64_t i0, uint64_t i1); +result_t validate_int64(__m64 a, int64_t i0); +result_t validate_uint64(__m64 a, uint64_t i0); +result_t validate_int32(__m128i a, int32_t i0, int32_t i1, int32_t i2, + int32_t i3); +result_t validate_uint32(__m128i a, uint32_t u0, uint32_t u1, uint32_t u2, + uint32_t u3); +result_t validate_int32(__m64 a, int32_t u0, int32_t u1); +result_t validate_uint32(__m64 a, uint32_t u0, uint32_t u1); +result_t validate_int16(__m128i a, int16_t i0, int16_t i1, int16_t i2, + int16_t i3, int16_t i4, int16_t i5, int16_t i6, + int16_t i7); +result_t validate_uint16(__m128i a, uint16_t u0, uint16_t u1, uint16_t u2, + uint16_t u3, uint16_t u4, uint16_t u5, uint16_t u6, + uint16_t u7); +result_t validate_int16(__m64 a, int16_t i0, int16_t i1, int16_t i2, + int16_t i3); +result_t validate_uint16(__m64 a, uint16_t u0, uint16_t u1, uint16_t u2, + uint16_t u3); +result_t validate_int8(__m128i a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, + int8_t i4, int8_t i5, int8_t i6, int8_t i7, int8_t i8, + int8_t i9, int8_t i10, int8_t i11, int8_t i12, + int8_t i13, int8_t i14, int8_t i15); +result_t validate_uint8(__m128i a, uint8_t u0, uint8_t u1, uint8_t u2, + uint8_t u3, uint8_t u4, uint8_t u5, uint8_t u6, + uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, + uint8_t u11, uint8_t u12, uint8_t u13, uint8_t u14, + uint8_t u15); +result_t validate_int8(__m64 a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, + int8_t i4, int8_t i5, int8_t i6, int8_t i7); +result_t 
validate_uint8(__m64 a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, + uint8_t u4, uint8_t u5, uint8_t u6, uint8_t u7); +result_t validate_float_pair(float a, float b); +result_t validate_double_pair(double a, double b); +result_t validate_float(__m64 a, float f0, float f1); +result_t validate_float(__m128 a, float f0, float f1, float f2, float f3); +result_t validate_float_epsilon(__m128 a, float f0, float f1, float f2, + float f3, float epsilon); +result_t validate_float_error(__m128 a, float f0, float f1, float f2, float f3, + float err); +result_t validate_double(__m128d a, double d0, double d1); +result_t validate_double_error(__m128d a, double d0, double d1, double err); + +#define VALIDATE_INT8_M128(A, B) \ + validate_int8(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7], B[8], B[9], \ + B[10], B[11], B[12], B[13], B[14], B[15]) +#define VALIDATE_UINT8_M128(A, B) \ + validate_uint8(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7], B[8], \ + B[9], B[10], B[11], B[12], B[13], B[14], B[15]) +#define VALIDATE_INT16_M128(A, B) \ + validate_int16(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]) +#define VALIDATE_UINT16_M128(A, B) \ + validate_uint16(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]) +#define VALIDATE_INT32_M128(A, B) validate_int32(A, B[0], B[1], B[2], B[3]) +#define VALIDATE_UINT32_M128(A, B) validate_uint32(A, B[0], B[1], B[2], B[3]) + +#define VALIDATE_INT8_M64(A, B) \ + validate_int8(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]) +#define VALIDATE_UINT8_M64(A, B) \ + validate_uint8(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]) +#define VALIDATE_INT16_M64(A, B) validate_int16(A, B[0], B[1], B[2], B[3]) +#define VALIDATE_UINT16_M64(A, B) validate_uint16(A, B[0], B[1], B[2], B[3]) +#define VALIDATE_INT32_M64(A, B) validate_int32(A, B[0], B[1]) +#define VALIDATE_UINT32_M64(A, B) validate_uint32(A, B[0], B[1]) +#define CHECK_RESULT(EXP) \ + if (EXP != TEST_SUCCESS) { \ + return TEST_FAIL; \ + } +#define IMM_2_ITER \ + TEST_IMPL(0) \ + TEST_IMPL(1) +#define IMM_4_ITER \ + IMM_2_ITER \ + TEST_IMPL(2) \ + TEST_IMPL(3) +#define IMM_8_ITER \ + IMM_4_ITER \ + TEST_IMPL(4) \ + TEST_IMPL(5) \ + TEST_IMPL(6) \ + TEST_IMPL(7) +#define IMM_16_ITER \ + IMM_8_ITER \ + TEST_IMPL(8) \ + TEST_IMPL(9) \ + TEST_IMPL(10) \ + TEST_IMPL(11) \ + TEST_IMPL(12) \ + TEST_IMPL(13) \ + TEST_IMPL(14) \ + TEST_IMPL(15) +#define IMM_32_ITER \ + IMM_16_ITER \ + TEST_IMPL(16) \ + TEST_IMPL(17) \ + TEST_IMPL(18) \ + TEST_IMPL(19) \ + TEST_IMPL(20) \ + TEST_IMPL(21) \ + TEST_IMPL(22) \ + TEST_IMPL(23) \ + TEST_IMPL(24) \ + TEST_IMPL(25) \ + TEST_IMPL(26) \ + TEST_IMPL(27) \ + TEST_IMPL(28) \ + TEST_IMPL(29) \ + TEST_IMPL(30) \ + TEST_IMPL(31) +#define IMM_64_ITER \ + IMM_32_ITER \ + TEST_IMPL(32) \ + TEST_IMPL(33) \ + TEST_IMPL(34) \ + TEST_IMPL(35) \ + TEST_IMPL(36) \ + TEST_IMPL(37) \ + TEST_IMPL(38) \ + TEST_IMPL(39) \ + TEST_IMPL(40) \ + TEST_IMPL(41) \ + TEST_IMPL(42) \ + TEST_IMPL(43) \ + TEST_IMPL(44) \ + TEST_IMPL(45) \ + TEST_IMPL(46) \ + TEST_IMPL(47) \ + TEST_IMPL(48) \ + TEST_IMPL(49) \ + TEST_IMPL(50) \ + TEST_IMPL(51) \ + TEST_IMPL(52) \ + TEST_IMPL(53) \ + TEST_IMPL(54) \ + TEST_IMPL(55) \ + TEST_IMPL(56) \ + TEST_IMPL(57) \ + TEST_IMPL(58) \ + TEST_IMPL(59) \ + TEST_IMPL(60) \ + TEST_IMPL(61) \ + TEST_IMPL(62) \ + TEST_IMPL(63) +#define IMM_128_ITER \ + IMM_64_ITER \ + TEST_IMPL(64) \ + TEST_IMPL(65) \ + TEST_IMPL(66) \ + TEST_IMPL(67) \ + TEST_IMPL(68) \ + TEST_IMPL(69) \ + TEST_IMPL(70) \ + TEST_IMPL(71) \ + TEST_IMPL(72) \ + TEST_IMPL(73) \ + TEST_IMPL(74) \ + 
TEST_IMPL(75) \ + TEST_IMPL(76) \ + TEST_IMPL(77) \ + TEST_IMPL(78) \ + TEST_IMPL(79) \ + TEST_IMPL(80) \ + TEST_IMPL(81) \ + TEST_IMPL(82) \ + TEST_IMPL(83) \ + TEST_IMPL(84) \ + TEST_IMPL(85) \ + TEST_IMPL(86) \ + TEST_IMPL(87) \ + TEST_IMPL(88) \ + TEST_IMPL(89) \ + TEST_IMPL(90) \ + TEST_IMPL(91) \ + TEST_IMPL(92) \ + TEST_IMPL(93) \ + TEST_IMPL(94) \ + TEST_IMPL(95) \ + TEST_IMPL(96) \ + TEST_IMPL(97) \ + TEST_IMPL(98) \ + TEST_IMPL(99) \ + TEST_IMPL(100) \ + TEST_IMPL(101) \ + TEST_IMPL(102) \ + TEST_IMPL(103) \ + TEST_IMPL(104) \ + TEST_IMPL(105) \ + TEST_IMPL(106) \ + TEST_IMPL(107) \ + TEST_IMPL(108) \ + TEST_IMPL(109) \ + TEST_IMPL(110) \ + TEST_IMPL(111) \ + TEST_IMPL(112) \ + TEST_IMPL(113) \ + TEST_IMPL(114) \ + TEST_IMPL(115) \ + TEST_IMPL(116) \ + TEST_IMPL(117) \ + TEST_IMPL(118) \ + TEST_IMPL(119) \ + TEST_IMPL(120) \ + TEST_IMPL(121) \ + TEST_IMPL(122) \ + TEST_IMPL(123) \ + TEST_IMPL(124) \ + TEST_IMPL(125) \ + TEST_IMPL(126) \ + TEST_IMPL(127) +#define IMM_256_ITER \ + IMM_128_ITER \ + TEST_IMPL(128) \ + TEST_IMPL(129) \ + TEST_IMPL(130) \ + TEST_IMPL(131) \ + TEST_IMPL(132) \ + TEST_IMPL(133) \ + TEST_IMPL(134) \ + TEST_IMPL(135) \ + TEST_IMPL(136) \ + TEST_IMPL(137) \ + TEST_IMPL(138) \ + TEST_IMPL(139) \ + TEST_IMPL(140) \ + TEST_IMPL(141) \ + TEST_IMPL(142) \ + TEST_IMPL(143) \ + TEST_IMPL(144) \ + TEST_IMPL(145) \ + TEST_IMPL(146) \ + TEST_IMPL(147) \ + TEST_IMPL(148) \ + TEST_IMPL(149) \ + TEST_IMPL(150) \ + TEST_IMPL(151) \ + TEST_IMPL(152) \ + TEST_IMPL(153) \ + TEST_IMPL(154) \ + TEST_IMPL(155) \ + TEST_IMPL(156) \ + TEST_IMPL(157) \ + TEST_IMPL(158) \ + TEST_IMPL(159) \ + TEST_IMPL(160) \ + TEST_IMPL(161) \ + TEST_IMPL(162) \ + TEST_IMPL(163) \ + TEST_IMPL(164) \ + TEST_IMPL(165) \ + TEST_IMPL(166) \ + TEST_IMPL(167) \ + TEST_IMPL(168) \ + TEST_IMPL(169) \ + TEST_IMPL(170) \ + TEST_IMPL(171) \ + TEST_IMPL(172) \ + TEST_IMPL(173) \ + TEST_IMPL(174) \ + TEST_IMPL(175) \ + TEST_IMPL(176) \ + TEST_IMPL(177) \ + TEST_IMPL(178) \ + TEST_IMPL(179) \ + TEST_IMPL(180) \ + TEST_IMPL(181) \ + TEST_IMPL(182) \ + TEST_IMPL(183) \ + TEST_IMPL(184) \ + TEST_IMPL(185) \ + TEST_IMPL(186) \ + TEST_IMPL(187) \ + TEST_IMPL(188) \ + TEST_IMPL(189) \ + TEST_IMPL(190) \ + TEST_IMPL(191) \ + TEST_IMPL(192) \ + TEST_IMPL(193) \ + TEST_IMPL(194) \ + TEST_IMPL(195) \ + TEST_IMPL(196) \ + TEST_IMPL(197) \ + TEST_IMPL(198) \ + TEST_IMPL(199) \ + TEST_IMPL(200) \ + TEST_IMPL(201) \ + TEST_IMPL(202) \ + TEST_IMPL(203) \ + TEST_IMPL(204) \ + TEST_IMPL(205) \ + TEST_IMPL(206) \ + TEST_IMPL(207) \ + TEST_IMPL(208) \ + TEST_IMPL(209) \ + TEST_IMPL(210) \ + TEST_IMPL(211) \ + TEST_IMPL(212) \ + TEST_IMPL(213) \ + TEST_IMPL(214) \ + TEST_IMPL(215) \ + TEST_IMPL(216) \ + TEST_IMPL(217) \ + TEST_IMPL(218) \ + TEST_IMPL(219) \ + TEST_IMPL(220) \ + TEST_IMPL(221) \ + TEST_IMPL(222) \ + TEST_IMPL(223) \ + TEST_IMPL(224) \ + TEST_IMPL(225) \ + TEST_IMPL(226) \ + TEST_IMPL(227) \ + TEST_IMPL(228) \ + TEST_IMPL(229) \ + TEST_IMPL(230) \ + TEST_IMPL(231) \ + TEST_IMPL(232) \ + TEST_IMPL(233) \ + TEST_IMPL(234) \ + TEST_IMPL(235) \ + TEST_IMPL(236) \ + TEST_IMPL(237) \ + TEST_IMPL(238) \ + TEST_IMPL(239) \ + TEST_IMPL(240) \ + TEST_IMPL(241) \ + TEST_IMPL(242) \ + TEST_IMPL(243) \ + TEST_IMPL(244) \ + TEST_IMPL(245) \ + TEST_IMPL(246) \ + TEST_IMPL(247) \ + TEST_IMPL(248) \ + TEST_IMPL(249) \ + TEST_IMPL(250) \ + TEST_IMPL(251) \ + TEST_IMPL(252) \ + TEST_IMPL(253) \ + TEST_IMPL(254) \ + TEST_IMPL(255) +} // namespace SSE2RVV + +#endif diff --git a/tests/impl.cpp b/tests/impl.cpp new file mode 100644 index 
0000000..53dddc3 --- /dev/null +++ b/tests/impl.cpp @@ -0,0 +1,9570 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "binding.h" +#include "impl.h" + +// Try 10,000 random floating point values for each test we run +#define MAX_TEST_VALUE 10000 + +/* Pattern Matching for C macros. + * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms + */ + +/* catenate */ +#define PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ + +#define IIF(c) PRIMITIVE_CAT(IIF_, c) +/* run the 2nd parameter */ +#define IIF_0(t, ...) __VA_ARGS__ +/* run the 1st parameter */ +#define IIF_1(t, ...) t + +// This program a set of unit tests to ensure that each SSE call provide the +// output we expect. If this fires an assert, then something didn't match up. +// +// Functions with "test_" prefix will be called in run_single_test. +namespace SSE2RVV { +// Forward declaration +class SSE2RVV_TEST_IMPL : public SSE2RVV_TEST { +public: + SSE2RVV_TEST_IMPL(void); + result_t load_test_float_pointers(uint32_t i); + result_t load_test_int_pointers(uint32_t i); + result_t run_single_test(INSTRUCTION_TEST test, uint32_t i); + + float *test_cases_float_pointer1; + float *test_cases_float_pointer2; + int32_t *test_cases_int_pointer1; + int32_t *test_cases_int_pointer2; + float test_cases_floats[MAX_TEST_VALUE]; + int32_t test_cases_ints[MAX_TEST_VALUE]; + + virtual ~SSE2RVV_TEST_IMPL(void) { + platform_aligned_free(test_cases_float_pointer1); + platform_aligned_free(test_cases_float_pointer2); + platform_aligned_free(test_cases_int_pointer1); + platform_aligned_free(test_cases_int_pointer2); + } + virtual void release(void) { delete this; } + virtual result_t run_test(INSTRUCTION_TEST test) { + result_t ret = TEST_SUCCESS; + + // Test a whole bunch of values + for (uint32_t i = 0; i < (MAX_TEST_VALUE - 8); i++) { + ret = load_test_float_pointers(i); // Load some random float values + if (ret == TEST_FAIL) + break; // load test float failed?? + ret = load_test_int_pointers(i); // load some random int values + if (ret == TEST_FAIL) + break; // load test float failed?? + // If we are testing the reciprocal, then invert the input data + // (easier for debugging) + if (test == it_mm_rcp_ps) { + test_cases_float_pointer1[0] = 1.0f / test_cases_float_pointer1[0]; + test_cases_float_pointer1[1] = 1.0f / test_cases_float_pointer1[1]; + test_cases_float_pointer1[2] = 1.0f / test_cases_float_pointer1[2]; + test_cases_float_pointer1[3] = 1.0f / test_cases_float_pointer1[3]; + } + if (test == it_mm_rcp_ps || test == it_mm_rcp_ss || + test == it_mm_rsqrt_ps || test == it_mm_rsqrt_ss) { + if ((rand() & 3) == 0) { + uint32_t r1 = rand() & 3; + uint32_t r2 = rand() & 3; + uint32_t r3 = rand() & 3; + uint32_t r4 = rand() & 3; + uint32_t r5 = rand() & 3; + uint32_t r6 = rand() & 3; + uint32_t r7 = rand() & 3; + uint32_t r8 = rand() & 3; + test_cases_float_pointer1[r1] = 0.0f; + test_cases_float_pointer1[r2] = 0.0f; + test_cases_float_pointer1[r3] = 0.0f; + test_cases_float_pointer1[r4] = 0.0f; + test_cases_float_pointer1[r5] = -0.0f; + test_cases_float_pointer1[r6] = -0.0f; + test_cases_float_pointer1[r7] = -0.0f; + test_cases_float_pointer1[r8] = -0.0f; + } + } + if (test == it_mm_cmpge_ps || test == it_mm_cmpge_ss || + test == it_mm_cmple_ps || test == it_mm_cmple_ss || + test == it_mm_cmpeq_ps || test == it_mm_cmpeq_ss) { + // Make sure at least one value is the same. 
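+        // (Lane 3 is forced equal so that the >=, <= and == comparisons are
+        // guaranteed to exercise their all-ones "true" result at least once.)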
+ test_cases_float_pointer1[3] = test_cases_float_pointer2[3]; + } + + if (test == it_mm_cmpord_ps || test == it_mm_cmpord_ss || + test == it_mm_cmpunord_ps || test == it_mm_cmpunord_ss || + test == it_mm_cmpeq_ps || test == it_mm_cmpeq_ss || + test == it_mm_cmpge_ps || test == it_mm_cmpge_ss || + test == it_mm_cmpgt_ps || test == it_mm_cmpgt_ss || + test == it_mm_cmple_ps || test == it_mm_cmple_ss || + test == it_mm_cmplt_ps || test == it_mm_cmplt_ss || + test == it_mm_cmpneq_ps || test == it_mm_cmpneq_ss || + test == it_mm_cmpnge_ps || test == it_mm_cmpnge_ss || + test == it_mm_cmpngt_ps || test == it_mm_cmpngt_ss || + test == it_mm_cmpnle_ps || test == it_mm_cmpnle_ss || + test == it_mm_cmpnlt_ps || test == it_mm_cmpnlt_ss || + test == it_mm_comieq_ss || test == it_mm_ucomieq_ss || + test == it_mm_comige_ss || test == it_mm_ucomige_ss || + test == it_mm_comigt_ss || test == it_mm_ucomigt_ss || + test == it_mm_comile_ss || test == it_mm_ucomile_ss || + test == it_mm_comilt_ss || test == it_mm_ucomilt_ss || + test == it_mm_comineq_ss || test == it_mm_ucomineq_ss) { + // Make sure the NaN values are included in the testing + // one out of four times. + if ((rand() & 3) == 0) { + uint32_t r1 = rand() & 3; + uint32_t r2 = rand() & 3; + test_cases_float_pointer1[r1] = nanf(""); + test_cases_float_pointer2[r2] = nanf(""); + } + } + + if (test == it_mm_cmpord_pd || test == it_mm_cmpord_sd || + test == it_mm_cmpunord_pd || test == it_mm_cmpunord_sd || + test == it_mm_cmpeq_pd || test == it_mm_cmpeq_sd || + test == it_mm_cmpge_pd || test == it_mm_cmpge_sd || + test == it_mm_cmpgt_pd || test == it_mm_cmpgt_sd || + test == it_mm_cmple_pd || test == it_mm_cmple_sd || + test == it_mm_cmplt_pd || test == it_mm_cmplt_sd || + test == it_mm_cmpneq_pd || test == it_mm_cmpneq_sd || + test == it_mm_cmpnge_pd || test == it_mm_cmpnge_sd || + test == it_mm_cmpngt_pd || test == it_mm_cmpngt_sd || + test == it_mm_cmpnle_pd || test == it_mm_cmpnle_sd || + test == it_mm_cmpnlt_pd || test == it_mm_cmpnlt_sd || + test == it_mm_comieq_sd || test == it_mm_ucomieq_sd || + test == it_mm_comige_sd || test == it_mm_ucomige_sd || + test == it_mm_comigt_sd || test == it_mm_ucomigt_sd || + test == it_mm_comile_sd || test == it_mm_ucomile_sd || + test == it_mm_comilt_sd || test == it_mm_ucomilt_sd || + test == it_mm_comineq_sd || test == it_mm_ucomineq_sd) { + // Make sure the NaN values are included in the testing + // one out of four times. + if ((rand() & 3) == 0) { + // FIXME: + // The argument "0xFFFFFFFFFFFF" is a tricky workaround to + // set the NaN value for doubles. The code is not intuitive + // and should be fixed in the future. + uint32_t r1 = ((rand() & 1) << 1) + 1; + uint32_t r2 = ((rand() & 1) << 1) + 1; + test_cases_float_pointer1[r1] = nanf("0xFFFFFFFFFFFF"); + test_cases_float_pointer2[r2] = nanf("0xFFFFFFFFFFFF"); + } + } + + if (test == it_mm_max_pd || test == it_mm_max_sd || + test == it_mm_min_pd || test == it_mm_min_sd) { + // Make sure the positive/negative infinity values are included + // in the testing one out of four times. 
+ if ((rand() & 3) == 0) { + uint32_t r1 = ((rand() & 1) << 1) + 1; + uint32_t r2 = ((rand() & 1) << 1) + 1; + uint32_t r3 = ((rand() & 1) << 1) + 1; + uint32_t r4 = ((rand() & 1) << 1) + 1; + test_cases_float_pointer1[r1] = INFINITY; + test_cases_float_pointer2[r2] = INFINITY; + test_cases_float_pointer1[r3] = -INFINITY; + test_cases_float_pointer1[r4] = -INFINITY; + } + } + +#if SSE2RVV_PRECISE_MINMAX + if (test == it_mm_max_ps || test == it_mm_max_ss || + test == it_mm_min_ps || test == it_mm_min_ss) { + // Make sure the NaN values are included in the testing + // one out of four times. + if ((rand() & 3) == 0) { + uint32_t r1 = rand() & 3; + uint32_t r2 = rand() & 3; + test_cases_float_pointer1[r1] = nanf(""); + test_cases_float_pointer2[r2] = nanf(""); + } + } + + if (test == it_mm_max_pd || test == it_mm_max_sd || + test == it_mm_min_pd || test == it_mm_min_sd) { + // Make sure the NaN values are included in the testing + // one out of four times. + if ((rand() & 3) == 0) { + // FIXME: + // The argument "0xFFFFFFFFFFFF" is a tricky workaround to + // set the NaN value for doubles. The code is not intuitive + // and should be fixed in the future. + uint32_t r1 = ((rand() & 1) << 1) + 1; + uint32_t r2 = ((rand() & 1) << 1) + 1; + test_cases_float_pointer1[r1] = nanf("0xFFFFFFFFFFFF"); + test_cases_float_pointer2[r2] = nanf("0xFFFFFFFFFFFF"); + } + } +#endif + + // one out of every random 64 times or so, mix up the test floats to + // contain some integer values + if ((rand() & 63) == 0) { + uint32_t option = rand() & 3; + switch (option) { + // All integers.. + case 0: + test_cases_float_pointer1[0] = float(test_cases_int_pointer1[0]); + test_cases_float_pointer1[1] = float(test_cases_int_pointer1[1]); + test_cases_float_pointer1[2] = float(test_cases_int_pointer1[2]); + test_cases_float_pointer1[3] = float(test_cases_int_pointer1[3]); + + test_cases_float_pointer2[0] = float(test_cases_int_pointer2[0]); + test_cases_float_pointer2[1] = float(test_cases_int_pointer2[1]); + test_cases_float_pointer2[2] = float(test_cases_int_pointer2[2]); + test_cases_float_pointer2[3] = float(test_cases_int_pointer2[3]); + + break; + case 1: { + uint32_t index = rand() & 3; + test_cases_float_pointer1[index] = + float(test_cases_int_pointer1[index]); + index = rand() & 3; + test_cases_float_pointer2[index] = + float(test_cases_int_pointer2[index]); + } break; + case 2: { + uint32_t index1 = rand() & 3; + uint32_t index2 = rand() & 3; + test_cases_float_pointer1[index1] = + float(test_cases_int_pointer1[index1]); + test_cases_float_pointer1[index2] = + float(test_cases_int_pointer1[index2]); + index1 = rand() & 3; + index2 = rand() & 3; + test_cases_float_pointer1[index1] = + float(test_cases_int_pointer1[index1]); + test_cases_float_pointer1[index2] = + float(test_cases_int_pointer1[index2]); + } break; + case 3: + test_cases_float_pointer1[0] = float(test_cases_int_pointer1[0]); + test_cases_float_pointer1[1] = float(test_cases_int_pointer1[1]); + test_cases_float_pointer1[2] = float(test_cases_int_pointer1[2]); + test_cases_float_pointer1[3] = float(test_cases_int_pointer1[3]); + break; + } + if ((rand() & 3) == 0) { // one out of 4 times, make halves + for (uint32_t j = 0; j < 4; j++) { + test_cases_float_pointer1[j] *= 0.5f; + test_cases_float_pointer2[j] *= 0.5f; + } + } + } + + ret = run_single_test(test, i); + if (ret == TEST_FAIL) // the test failed... 
+ { + // Set a breakpoint here if you want to step through the failure + // case in the debugger + ret = run_single_test(test, i); + break; + } + } + return ret; + } +}; + +const char *instructionString[] = { +#define _(x) #x, + INTRIN_LIST +#undef _ +}; + +// Produce rounding which is the same as SSE instructions with _MM_ROUND_NEAREST +// rounding mode +static inline float bankersRounding(float val) { + if (val < 0) + return -bankersRounding(-val); + + float ret; + float roundDown = floorf(val); // Round down value + float roundUp = ceilf(val); // Round up value + float diffDown = val - roundDown; + float diffUp = roundUp - val; + + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + ret = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + ret = roundUp; + } else { + /* If it's equidistant between round up and round down value, pick the + * one which is an even number */ + float half = roundDown / 2; + if (half != floorf(half)) { + /* If the round down value is odd, return the round up value */ + ret = roundUp; + } else { + /* If the round up value is odd, return the round down value */ + ret = roundDown; + } + } + return ret; +} + +static inline double bankersRounding(double val) { + if (val < 0) + return -bankersRounding(-val); + + double ret; + double roundDown = floor(val); // Round down value + double roundUp = ceil(val); // Round up value + double diffDown = val - roundDown; + double diffUp = roundUp - val; + + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + ret = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + ret = roundUp; + } else { + /* If it's equidistant between round up and round down value, pick the + * one which is an even number */ + double half = roundDown / 2; + if (half != floor(half)) { + /* If the round down value is odd, return the round up value */ + ret = roundUp; + } else { + /* If the round up value is odd, return the round down value */ + ret = roundDown; + } + } + return ret; +} + +// SplitMix64 PRNG by Sebastiano Vigna, see: +// +static uint64_t state; // the state of SplitMix64 PRNG +const double TWOPOWER64 = pow(2, 64); + +#define SSE2RVV_INIT_RNG(seed) \ + do { \ + state = seed; \ + } while (0) + +static double next() { + uint64_t z = (state += 0x9e3779b97f4a7c15); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + return z ^ (z >> 31); +} + +static float ranf() { return next() / TWOPOWER64; } + +static float ranf(float low, float high) { return ranf() * (high - low) + low; } + +// Enable the tests which are using the macro of another tests +result_t test_mm_slli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); +result_t test_mm_srli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); +result_t test_mm_shuffle_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_set_epi32". +// __m128i do_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w) { +// __m128i a = _mm_set_epi32(x, y, z, w); +// validate_int32(a, w, z, y, x); +// return a; +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to load __m64 data. 
+// template __m64 load_m64(const T *p) { return *((const __m64 *)p); } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_load_ps". +// template __m128 load_m128(const T *p) { +// return _mm_loadu_ps((const float *)p); +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_load_ps". +// template __m128i load_m128i(const T *p) { +// __m128 a = _mm_loadu_ps((const float *)p); +// __m128i ia = *(const __m128i *)&a; +// return ia; +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_load_pd". +// template __m128d load_m128d(const T *p) { +// return _mm_loadu_pd((const double *)p); +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_store_ps". +// result_t do_mm_store_ps(float *p, float x, float y, float z, float w) { +// __m128 a = _mm_set_ps(x, y, z, w); +// _mm_store_ps(p, a); +// ASSERT_RETURN(p[0] == w); +// ASSERT_RETURN(p[1] == z); +// ASSERT_RETURN(p[2] == y); +// ASSERT_RETURN(p[3] == x); +// return TEST_SUCCESS; +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_store_ps". +// result_t do_mm_store_ps(int32_t *p, int32_t x, int32_t y, int32_t z, +// int32_t w) { +// __m128i a = _mm_set_epi32(x, y, z, w); +// _mm_store_ps((float *)p, *(const __m128 *)&a); +// ASSERT_RETURN(p[0] == w); +// ASSERT_RETURN(p[1] == z); +// ASSERT_RETURN(p[2] == y); +// ASSERT_RETURN(p[3] == x); +// return TEST_SUCCESS; +// } + +float cmp_noNaN(float a, float b) { + return (!isnan(a) && !isnan(b)) ? ALL_BIT_1_32 : 0.0f; +} + +double cmp_noNaN(double a, double b) { + return (!isnan(a) && !isnan(b)) ? ALL_BIT_1_64 : 0.0f; +} + +float cmp_hasNaN(float a, float b) { + return (isnan(a) || isnan(b)) ? ALL_BIT_1_32 : 0.0f; +} + +double cmp_hasNaN(double a, double b) { + return (isnan(a) || isnan(b)) ? 
ALL_BIT_1_64 : 0.0f; +} + +int32_t comilt_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a < b); +} + +int32_t comigt_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a > b); +} + +int32_t comile_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a <= b); +} + +int32_t comige_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a >= b); +} + +int32_t comieq_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a == b); +} + +int32_t comineq_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 1; + return (a != b); +} + +static inline int16_t saturate_16(int32_t a) { + int32_t max = (1 << 15) - 1; + int32_t min = -(1 << 15); + if (a > max) + return max; + if (a < min) + return min; + return a; +} + +uint32_t canonical_crc32_u8(uint32_t crc, uint8_t v) { + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ uint32_t(0x82f63b78); + else + crc = (crc >> 1); + } + return crc; +} + +uint32_t canonical_crc32_u16(uint32_t crc, uint16_t v) { + crc = canonical_crc32_u8(crc, v & 0xff); + crc = canonical_crc32_u8(crc, (v >> 8) & 0xff); + return crc; +} + +uint32_t canonical_crc32_u32(uint32_t crc, uint32_t v) { + crc = canonical_crc32_u16(crc, v & 0xffff); + crc = canonical_crc32_u16(crc, (v >> 16) & 0xffff); + return crc; +} + +uint64_t canonical_crc32_u64(uint64_t crc, uint64_t v) { + crc = canonical_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = canonical_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); + return crc; +} + +static const uint8_t crypto_aes_sbox[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, + 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, + 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, + 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, + 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, + 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, + 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, + 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, + 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, + 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, + 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, + 0xb0, 0x54, 0xbb, 0x16, +}; + +static const uint8_t crypto_aes_rsbox[256] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, + 0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32, + 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 
0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, + 0x6d, 0x8b, 0xd1, 0x25, 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 0x6c, 0x70, 0x48, 0x50, + 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, + 0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91, 0x11, 0x41, + 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, + 0x1c, 0x75, 0xdf, 0x6e, 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b, + 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, + 0x27, 0x80, 0xec, 0x5f, 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d, + 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, + 0x55, 0x21, 0x0c, 0x7d, +}; + +// XT is x_time function that muliplies 'x' by 2 in GF(2^8) +#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) + +inline __m128i aesenc_128_reference(__m128i a, __m128i b) { + uint8_t i, t, u, v[4][4]; + for (i = 0; i < 16; ++i) { + v[((i / 4) + 4 - (i % 4)) % 4][i % 4] = + crypto_aes_sbox[((SIMDVec *)&a)->m128_u8[i]]; + } + for (i = 0; i < 4; ++i) { + t = v[i][0]; + u = v[i][0] ^ v[i][1] ^ v[i][2] ^ v[i][3]; + v[i][0] ^= u ^ XT(v[i][0] ^ v[i][1]); + v[i][1] ^= u ^ XT(v[i][1] ^ v[i][2]); + v[i][2] ^= u ^ XT(v[i][2] ^ v[i][3]); + v[i][3] ^= u ^ XT(v[i][3] ^ t); + } + + for (i = 0; i < 16; ++i) { + ((SIMDVec *)&a)->m128_u8[i] = v[i / 4][i % 4] ^ ((SIMDVec *)&b)->m128_u8[i]; + } + + return a; +} + +#define MULTIPLY(x, y) \ + (((y & 1) * x) ^ ((y >> 1 & 1) * XT(x)) ^ ((y >> 2 & 1) * XT(XT(x))) ^ \ + ((y >> 3 & 1) * XT(XT(XT(x)))) ^ ((y >> 4 & 1) * XT(XT(XT(XT(x)))))) + +inline __m128i aesdec_128_reference(__m128i a, __m128i b) { + uint8_t i, e, f, g, h, v[4][4]; + for (i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = + crypto_aes_rsbox[((SIMDVec *)&a)->m128_u8[i]]; + } + + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^ + MULTIPLY(h, 0x09); + v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^ + MULTIPLY(h, 0x0d); + v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^ + MULTIPLY(h, 0x0b); + v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^ + MULTIPLY(h, 0x0e); + } + + for (i = 0; i < 16; ++i) { + ((SIMDVec *)&a)->m128_u8[i] = v[i / 4][i % 4] ^ ((SIMDVec *)&b)->m128_u8[i]; + } + return a; +} + +inline __m128i aesenclast_128_reference(__m128i s, __m128i rk) { + uint8_t i, v[4][4]; + for (i = 0; i < 16; ++i) + v[((i / 4) + 4 - (i % 4)) % 4][i % 4] = + crypto_aes_sbox[((SIMDVec *)&s)->m128_u8[i]]; + for (i = 0; i < 16; ++i) + ((SIMDVec *)&s)->m128_u8[i] = + v[i / 4][i % 4] ^ ((SIMDVec *)&rk)->m128_u8[i]; + return s; +} + +// Rotates right (circular right shift) value by "amount" positions +static inline uint32_t rotr(uint32_t value, uint32_t amount) { + return (value >> amount) | (value << ((32 - amount) & 31)); +} + +static inline 
uint64_t MUL(uint32_t a, uint32_t b) { + return (uint64_t)a * (uint64_t)b; +} + +// From BearSSL. Performs a 32-bit->64-bit carryless/polynomial +// long multiply. +// +// This implementation was chosen because it is reasonably fast +// without a lookup table or branching. +// +// This does it by splitting up the bits in a way that they +// would not carry, then combine them together with xor (a +// carryless add). +// +// https://www.bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul.c;h=3623202;hb=5f045c7#l164 +static uint64_t clmul_32(uint32_t x, uint32_t y) { + uint32_t x0, x1, x2, x3; + uint32_t y0, y1, y2, y3; + uint64_t z0, z1, z2, z3; + + x0 = x & (uint32_t)0x11111111; + x1 = x & (uint32_t)0x22222222; + x2 = x & (uint32_t)0x44444444; + x3 = x & (uint32_t)0x88888888; + y0 = y & (uint32_t)0x11111111; + y1 = y & (uint32_t)0x22222222; + y2 = y & (uint32_t)0x44444444; + y3 = y & (uint32_t)0x88888888; + z0 = MUL(x0, y0) ^ MUL(x1, y3) ^ MUL(x2, y2) ^ MUL(x3, y1); + z1 = MUL(x0, y1) ^ MUL(x1, y0) ^ MUL(x2, y3) ^ MUL(x3, y2); + z2 = MUL(x0, y2) ^ MUL(x1, y1) ^ MUL(x2, y0) ^ MUL(x3, y3); + z3 = MUL(x0, y3) ^ MUL(x1, y2) ^ MUL(x2, y1) ^ MUL(x3, y0); + z0 &= (uint64_t)0x1111111111111111; + z1 &= (uint64_t)0x2222222222222222; + z2 &= (uint64_t)0x4444444444444444; + z3 &= (uint64_t)0x8888888888888888; + return z0 | z1 | z2 | z3; +} + +// Performs a 64x64->128-bit carryless/polynomial long +// multiply, using the above routine to calculate the +// subproducts needed for the full-size multiply. +// +// This uses the Karatsuba algorithm. +// +// Normally, the Karatsuba algorithm isn't beneficial +// until very large numbers due to carry tracking and +// multiplication being relatively cheap. +// +// However, we have no carries and multiplication is +// definitely not cheap, so the Karatsuba algorithm is +// a low cost and easy optimization. +// +// https://en.m.wikipedia.org/wiki/Karatsuba_algorithm +// +// Note that addition and subtraction are both +// performed with xor, since all operations are +// carryless. +// +// The comments represent the actual mathematical +// operations being performed (instead of the bitwise +// operations) and to reflect the linked Wikipedia article. 
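+//
+// As a restatement of the recombination below (with B = 2 and m = 32): the
+// carryless product is
+//   x * y = z2*2^64 ^ z1*2^32 ^ z0,  z1 = clmul_32(x0 ^ x1, y0 ^ y1) ^ z0 ^ z2,
+// so the low 64 bits are z0 ^ (z1 << 32) and the high 64 bits are
+// z2 ^ (z1 >> 32), which is exactly the pair returned.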
+static std::pair<uint64_t, uint64_t> clmul_64(uint64_t x, uint64_t y) {
+  // B = 2
+  // m = 32
+  // x = (x1 * B^m) + x0
+  uint32_t x0 = x & 0xffffffff;
+  uint32_t x1 = x >> 32;
+  // y = (y1 * B^m) + y0
+  uint32_t y0 = y & 0xffffffff;
+  uint32_t y1 = y >> 32;
+
+  // z0 = x0 * y0
+  uint64_t z0 = clmul_32(x0, y0);
+  // z2 = x1 * y1
+  uint64_t z2 = clmul_32(x1, y1);
+  // z1 = (x0 + x1) * (y0 + y1) - z0 - z2
+  uint64_t z1 = clmul_32(x0 ^ x1, y0 ^ y1) ^ z0 ^ z2;
+
+  // xy = z0 + (z1 * B^m) + (z2 * B^2m)
+  // note: z1 is split between the low and high halves
+  uint64_t xy0 = z0 ^ (z1 << 32);
+  uint64_t xy1 = z2 ^ (z1 >> 32);
+
+  return std::make_pair(xy0, xy1);
+}
+
+/* MMX */
+result_t test_mm_empty(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // return TEST_SUCCESS;
+  return TEST_UNIMPL;
+}
+
+/* SSE */
+result_t test_mm_add_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const float *_a = impl.test_cases_float_pointer1;
+  // const float *_b = impl.test_cases_float_pointer2;
+  // float dx = _a[0] + _b[0];
+  // float dy = _a[1] + _b[1];
+  // float dz = _a[2] + _b[2];
+  // float dw = _a[3] + _b[3];
+  //
+  // __m128 a = load_m128(_a);
+  // __m128 b = load_m128(_b);
+  // __m128 c = _mm_add_ps(a, b);
+  // return validate_float(c, dx, dy, dz, dw);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_add_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const float *_a = impl.test_cases_float_pointer1;
+  // const float *_b = impl.test_cases_float_pointer1;
+  //
+  // float f0 = _a[0] + _b[0];
+  // float f1 = _a[1];
+  // float f2 = _a[2];
+  // float f3 = _a[3];
+  //
+  // __m128 a = _mm_load_ps(_a);
+  // __m128 b = _mm_load_ps(_b);
+  // __m128 c = _mm_add_ss(a, b);
+  //
+  // return validate_float(c, f0, f1, f2, f3);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_and_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const float *_a = impl.test_cases_float_pointer1;
+  // const float *_b = impl.test_cases_float_pointer2;
+  // __m128 a = load_m128(_a);
+  // __m128 b = load_m128(_b);
+  // __m128 c = _mm_and_ps(a, b);
+  // now for the assertion...
+  // const uint32_t *ia = (const uint32_t *)&a;
+  // const uint32_t *ib = (const uint32_t *)&b;
+  // uint32_t r[4];
+  // r[0] = ia[0] & ib[0];
+  // r[1] = ia[1] & ib[1];
+  // r[2] = ia[2] & ib[2];
+  // r[3] = ia[3] & ib[3];
+  // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
+  // result_t res = VALIDATE_INT32_M128(*(const __m128i *)&c, r);
+  // if (res) {
+  //   res = VALIDATE_INT32_M128(ret, r);
+  // }
+  // return res;
+  return TEST_UNIMPL;
+}
+
+// r0 := ~a0 & b0
+// r1 := ~a1 & b1
+// r2 := ~a2 & b2
+// r3 := ~a3 & b3
+result_t test_mm_andnot_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const float *_a = impl.test_cases_float_pointer1;
+  // const float *_b = impl.test_cases_float_pointer2;
+  //
+  // __m128 a = load_m128(_a);
+  // __m128 b = load_m128(_b);
+  // __m128 c = _mm_andnot_ps(a, b);
+  // now for the assertion...
+ // const uint32_t *ia = (const uint32_t *)&a; + // const uint32_t *ib = (const uint32_t *)&b; + // uint32_t r[4]; + // r[0] = ~ia[0] & ib[0]; + // r[1] = ~ia[1] & ib[1]; + // r[2] = ~ia[2] & ib[2]; + // r[3] = ~ia[3] & ib[3]; + // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + // result_t res = TEST_FAIL; + // res = VALIDATE_INT32_M128(*(const __m128i *)&c, r); + // if (res) { + // res = VALIDATE_INT32_M128(ret, r); + // } + // return res; + return TEST_UNIMPL; +} + +result_t test_mm_avg_pu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // uint16_t d[4]; + // d[0] = (_a[0] + _b[0] + 1) >> 1; + // d[1] = (_a[1] + _b[1] + 1) >> 1; + // d[2] = (_a[2] + _b[2] + 1) >> 1; + // d[3] = (_a[3] + _b[3] + 1) >> 1; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_avg_pu16(a, b); + // + // return VALIDATE_UINT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_avg_pu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint8_t d[8]; + // d[0] = (_a[0] + _b[0] + 1) >> 1; + // d[1] = (_a[1] + _b[1] + 1) >> 1; + // d[2] = (_a[2] + _b[2] + 1) >> 1; + // d[3] = (_a[3] + _b[3] + 1) >> 1; + // d[4] = (_a[4] + _b[4] + 1) >> 1; + // d[5] = (_a[5] + _b[5] + 1) >> 1; + // d[6] = (_a[6] + _b[6] + 1) >> 1; + // d[7] = (_a[7] + _b[7] + 1) >> 1; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_avg_pu8(a, b); + // + // return VALIDATE_UINT8_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] == _b[0] ? -1 : 0; + // result[1] = _a[1] == _b[1] ? -1 : 0; + // result[2] = _a[2] == _b[2] ? -1 : 0; + // result[3] = _a[3] == _b[3] ? -1 : 0; + // + // __m128 ret = _mm_cmpeq_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] == _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpeq_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpge_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] >= _b[0] ? -1 : 0; + // result[1] = _a[1] >= _b[1] ? -1 : 0; + // result[2] = _a[2] >= _b[2] ? -1 : 0; + // result[3] = _a[3] >= _b[3] ? 
-1 : 0; + // + // __m128 ret = _mm_cmpge_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpge_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] >= _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpge_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] > _b[0] ? -1 : 0; + // result[1] = _a[1] > _b[1] ? -1 : 0; + // result[2] = _a[2] > _b[2] ? -1 : 0; + // result[3] = _a[3] > _b[3] ? -1 : 0; + // + // __m128 ret = _mm_cmpgt_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] > _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpgt_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmple_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] <= _b[0] ? -1 : 0; + // result[1] = _a[1] <= _b[1] ? -1 : 0; + // result[2] = _a[2] <= _b[2] ? -1 : 0; + // result[3] = _a[3] <= _b[3] ? -1 : 0; + // + // __m128 ret = _mm_cmple_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmple_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] <= _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmple_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] < _b[0] ? -1 : 0; + // result[1] = _a[1] < _b[1] ? -1 : 0; + // result[2] = _a[2] < _b[2] ? -1 : 0; + // result[3] = _a[3] < _b[3] ? 
-1 : 0; + // + // __m128 ret = _mm_cmplt_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] < _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmplt_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpneq_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] != _b[0] ? -1 : 0; + // result[1] = _a[1] != _b[1] ? -1 : 0; + // result[2] = _a[2] != _b[2] ? -1 : 0; + // result[3] = _a[3] != _b[3] ? -1 : 0; + // + // __m128 ret = _mm_cmpneq_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpneq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] != _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpneq_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnge_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] >= _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = !(_a[1] >= _b[1]) ? ALL_BIT_1_32 : 0; + // result[2] = !(_a[2] >= _b[2]) ? ALL_BIT_1_32 : 0; + // result[3] = !(_a[3] >= _b[3]) ? ALL_BIT_1_32 : 0; + // + // __m128 ret = _mm_cmpnge_ps(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnge_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] >= _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpnge_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpngt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] > _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = !(_a[1] > _b[1]) ? ALL_BIT_1_32 : 0; + // result[2] = !(_a[2] > _b[2]) ? ALL_BIT_1_32 : 0; + // result[3] = !(_a[3] > _b[3]) ? 
ALL_BIT_1_32 : 0; + // + // __m128 ret = _mm_cmpngt_ps(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpngt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] > _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpngt_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnle_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] <= _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = !(_a[1] <= _b[1]) ? ALL_BIT_1_32 : 0; + // result[2] = !(_a[2] <= _b[2]) ? ALL_BIT_1_32 : 0; + // result[3] = !(_a[3] <= _b[3]) ? ALL_BIT_1_32 : 0; + // + // __m128 ret = _mm_cmpnle_ps(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnle_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] <= _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpnle_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnlt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] < _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = !(_a[1] < _b[1]) ? ALL_BIT_1_32 : 0; + // result[2] = !(_a[2] < _b[2]) ? ALL_BIT_1_32 : 0; + // result[3] = !(_a[3] < _b[3]) ? ALL_BIT_1_32 : 0; + // + // __m128 ret = _mm_cmpnlt_ps(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnlt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] < _b[0]) ? 
ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpnlt_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpord_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // + // for (uint32_t i = 0; i < 4; i++) { + // result[i] = cmp_noNaN(_a[i], _b[i]); + // } + // + // __m128 ret = _mm_cmpord_ps(a, b); + // + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpord_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = cmp_noNaN(_a[0], _b[0]); + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpord_ss(a, b); + // + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpunord_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // + // for (uint32_t i = 0; i < 4; i++) { + // result[i] = cmp_hasNaN(_a[i], _b[i]); + // } + // + // __m128 ret = _mm_cmpunord_ps(a, b); + // + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpunord_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = cmp_hasNaN(_a[0], _b[0]); + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpunord_ss(a, b); + // + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_comieq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comieq_ss correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comieq_ss(_a[0], _b[0]); + // int32_t ret = _mm_comieq_ss(a, b); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comige_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comige_ss(_a[0], _b[0]); + // int32_t ret = _mm_comige_ss(a, b); + // + // return result == ret ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_comigt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comigt_ss(_a[0], _b[0]); + // int32_t ret = _mm_comigt_ss(a, b); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_comile_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comile_ss correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comile_ss(_a[0], _b[0]); + // int32_t ret = _mm_comile_ss(a, b); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comilt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comilt_ss correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comilt_ss(_a[0], _b[0]); + // + // int32_t ret = _mm_comilt_ss(a, b); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comineq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comineq_ss correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comineq_ss(_a[0], _b[0]); + // int32_t ret = _mm_comineq_ss(a, b); + // + // return result == ret ? 
TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_cvt_pi2ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // + // float dx = (float)_b[0]; + // float dy = (float)_b[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m64 b = load_m64(_b); + // __m128 c = _mm_cvt_pi2ps(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvt_ps2pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // for (int idx = 0; idx < 2; idx++) { + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d[idx] = (int32_t)(bankersRounding(_a[idx])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d[idx] = (int32_t)(floorf(_a[idx])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d[idx] = (int32_t)(ceilf(_a[idx])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d[idx] = (int32_t)(_a[idx]); + // break; + // } + // } + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvt_ps2pi(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvt_si2ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int32_t b = *impl.test_cases_int_pointer2; + // + // float dx = (float)b; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_cvt_si2ss(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvt_ss2si(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d0; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d0 = (int32_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d0 = (int32_t)(floorf(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d0 = (int32_t)(ceilf(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d0 = (int32_t)(_a[0]); + // break; + // } + // + // __m128 a = load_m128(_a); + // int32_t ret = _mm_cvt_ss2si(a); + // return ret == d0 ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi16_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_a[2]; + // float dw = (float)_a[3]; + // + // __m64 a = load_m64(_a); + // __m128 c = _mm_cvtpi16_ps(a); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi32_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // float dx = (float)_b[0]; + // float dy = (float)_b[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m64 b = load_m64(_b); + // __m128 c = _mm_cvtpi32_ps(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi32x2_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_b[0]; + // float dw = (float)_b[1]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m128 c = _mm_cvtpi32x2_ps(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi8_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_a[2]; + // float dw = (float)_a[3]; + // + // __m64 a = load_m64(_a); + // __m128 c = _mm_cvtpi8_ps(a); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int16_t rnd[4]; + // + // for (int i = 0; i < 4; i++) { + // if ((float)INT16_MAX <= _a[i] && _a[i] <= (float)INT32_MAX) { + // rnd[i] = INT16_MAX; + // } else if (INT16_MIN < _a[i] && _a[i] < INT16_MAX) { + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // rnd[i] = (int16_t)bankersRounding(_a[i]); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // rnd[i] = (int16_t)floorf(_a[i]); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // rnd[i] = (int16_t)ceilf(_a[i]); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // rnd[i] = (int16_t)_a[i]; + // break; + // } + // } else { + // rnd[i] = INT16_MIN; + // } + // } + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvtps_pi16(a); + // return VALIDATE_INT16_M64(ret, rnd); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d[0] = (int32_t)bankersRounding(_a[0]); + // d[1] = (int32_t)bankersRounding(_a[1]); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d[0] = (int32_t)floorf(_a[0]); + // d[1] = (int32_t)floorf(_a[1]); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d[0] = (int32_t)ceilf(_a[0]); + // d[1] = (int32_t)ceilf(_a[1]); + // break; + // 
case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // break; + // } + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvtps_pi32(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int8_t rnd[8] = {}; + // + // for (int i = 0; i < 4; i++) { + // if ((float)INT8_MAX <= _a[i] && _a[i] <= (float)INT32_MAX) { + // rnd[i] = INT8_MAX; + // } else if (INT8_MIN < _a[i] && _a[i] < INT8_MAX) { + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // rnd[i] = (int8_t)bankersRounding(_a[i]); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // rnd[i] = (int8_t)floorf(_a[i]); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // rnd[i] = (int8_t)ceilf(_a[i]); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // rnd[i] = (int8_t)_a[i]; + // break; + // } + // } else { + // rnd[i] = INT8_MIN; + // } + // } + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvtps_pi8(a); + // return VALIDATE_INT8_M64(ret, rnd); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpu16_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_a[2]; + // float dw = (float)_a[3]; + // + // __m64 a = load_m64(_a); + // __m128 c = _mm_cvtpu16_ps(a); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpu8_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_a[2]; + // float dw = (float)_a[3]; + // + // __m64 a = load_m64(_a); + // __m128 c = _mm_cvtpu8_ps(a); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi32_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int32_t b = *impl.test_cases_int_pointer2; + // + // float dx = (float)b; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_cvtsi32_ss(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int64_t b = *(int64_t *)impl.test_cases_int_pointer2; + // + // float dx = (float)b; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_cvtsi64_ss(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtss_f32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // float f = _a[0]; + // + // __m128 a = load_m128(_a); + // float c = _mm_cvtss_f32(a); + // + // return f == c ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtss_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // int32_t d0; + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d0 = (int32_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d0 = (int32_t)(floorf(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d0 = (int32_t)(ceilf(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d0 = (int32_t)(_a[0]); + // break; + // } + // + // __m128 a = load_m128(_a); + // int32_t ret = _mm_cvtss_si32(a); + // + // return ret == d0 ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtss_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // int64_t d0; + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d0 = (int64_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d0 = (int64_t)(floorf(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d0 = (int64_t)(ceilf(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d0 = (int64_t)(_a[0]); + // break; + // } + // + // __m128 a = load_m128(_a); + // int64_t ret = _mm_cvtss_si64(a); + // + // return ret == d0 ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtt_ps2pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvtt_ps2pi(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtt_ss2si(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // __m128 a = load_m128(_a); + // int ret = _mm_cvtt_ss2si(a); + // + // return ret == (int32_t)_a[0] ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvttps_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvttps_pi32(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvttss_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // __m128 a = load_m128(_a); + // int ret = _mm_cvttss_si32(a); + // + // return ret == (int32_t)_a[0] ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvttss_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // __m128 a = load_m128(_a); + // int64_t ret = _mm_cvttss_si64(a); + // + // return ret == (int64_t)_a[0] ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_div_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float f0 = _a[0] / _b[0]; + // float f1 = _a[1] / _b[1]; + // float f2 = _a[2] / _b[2]; + // float f3 = _a[3] / _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_div_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_div_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float d0 = _a[0] / _b[0]; + // float d1 = _a[1]; + // float d2 = _a[2]; + // float d3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_div_ss(a, b); + // + // return validate_float(c, d0, d1, d2, d3); + return TEST_UNIMPL; +} + +result_t test_mm_extract_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME GCC has bug on "_mm_extract_pi16" intrinsics. We will enable this + // test when GCC fix this bug. + // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98495 for more + // information + // #if defined(__clang__) || defined(_MSC_VER) + // uint64_t *_a = (uint64_t *)impl.test_cases_int_pointer1; + // const int idx = iter & 0x3; + // + // __m64 a = load_m64(_a); + // int c; + // switch (idx) { + // case 0: + // c = _mm_extract_pi16(a, 0); + // break; + // case 1: + // c = _mm_extract_pi16(a, 1); + // break; + // case 2: + // c = _mm_extract_pi16(a, 2); + // break; + // case 3: + // c = _mm_extract_pi16(a, 3); + // break; + // } + // + // ASSERT_RETURN((uint64_t)c == ((*_a >> (idx * 16)) & 0xFFFF)); + // ASSERT_RETURN(0 == ((uint64_t)c & 0xFFFF0000)); + // return TEST_SUCCESS; + // #else + // return TEST_UNIMPL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_malloc(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); +result_t test_mm_free(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* We verify _mm_malloc first, and there is no need to check _mm_free . + // */ return test_mm_malloc(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_get_flush_zero_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // int res_flush_zero_on, res_flush_zero_off; + // _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + // res_flush_zero_on = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON; + // _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); + // res_flush_zero_off = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF; + // + // return (res_flush_zero_on && res_flush_zero_off) ? TEST_SUCCESS : + // TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_get_rounding_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // int res_toward_zero, res_to_neg_inf, res_to_pos_inf, res_nearest; + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // res_toward_zero = _MM_GET_ROUNDING_MODE() == _MM_ROUND_TOWARD_ZERO ? 1 : + // 0; _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); res_to_neg_inf = + // _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN ? 1 : 0; + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // res_to_pos_inf = _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP ? 1 : 0; + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // res_nearest = _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST ? 
1 : 0; + // + // if (res_toward_zero && res_to_neg_inf && res_to_pos_inf && res_nearest) { + // return TEST_SUCCESS; + // } else { + // return TEST_FAIL; + // } + return TEST_UNIMPL; +} + +result_t test_mm_getcsr(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // store original csr value for post test restoring + // unsigned int originalCsr = _mm_getcsr(); + // + // unsigned int roundings[] = {_MM_ROUND_TOWARD_ZERO, _MM_ROUND_DOWN, + // _MM_ROUND_UP, _MM_ROUND_NEAREST}; + // for (size_t i = 0; i < sizeof(roundings) / sizeof(roundings[0]); i++) { + // _mm_setcsr(_mm_getcsr() | roundings[i]); + // if ((_mm_getcsr() & roundings[i]) != roundings[i]) { + // return TEST_FAIL; + // } + // } + // + // restore original csr value for remaining tests + // _mm_setcsr(originalCsr); + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_insert_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t insert = (int16_t)impl.test_cases_ints[iter]; + // __m64 a; + // __m64 b; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[4]; + // for (int i = 0; i < 4; i++) { + // d##IDX[i] = _a[i]; + // } + // d##IDX[IDX] = insert; + // + // a = load_m64(_a); + // b = _mm_insert_pi16(a, insert, IDX); + // CHECK_RESULT(VALIDATE_INT16_M64(b, d##IDX)) + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_load_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_load_ps(addr); + // + // return validate_float(ret, addr[0], addr[1], addr[2], addr[3]); + return TEST_UNIMPL; +} + +result_t test_mm_load_ps1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_load_ps1(addr); + // + // return validate_float(ret, addr[0], addr[0], addr[0], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_load_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_load_ss(addr); + // + // return validate_float(ret, addr[0], 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_load1_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // __m128 a = _mm_load1_ps(p); + // return validate_float(a, p[0], p[0], p[0], p[0]); + return TEST_UNIMPL; +} + +result_t test_mm_loadh_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p1 = impl.test_cases_float_pointer1; + // const float *p2 = impl.test_cases_float_pointer2; + // const __m64 *b = (const __m64 *)p2; + // __m128 a = _mm_load_ps(p1); + // __m128 c = _mm_loadh_pi(a, b); + // + // return validate_float(c, p1[0], p1[1], p2[0], p2[1]); + return TEST_UNIMPL; +} + +result_t test_mm_loadl_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p1 = impl.test_cases_float_pointer1; + // const float *p2 = impl.test_cases_float_pointer2; + // __m128 a = _mm_load_ps(p1); + // const __m64 *b = (const __m64 *)p2; + // __m128 c = _mm_loadl_pi(a, b); + // + // return validate_float(c, p2[0], p2[1], p1[2], p1[3]); + return TEST_UNIMPL; +} + +result_t test_mm_loadr_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_loadr_ps(addr); + // + // return validate_float(ret, addr[3], addr[2], addr[1], addr[0]); + return TEST_UNIMPL; +} + 
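All of the placeholder cases above share the same shape: compute the expected lanes with plain scalar C from the `impl.test_cases_*` buffers, run the intrinsic, and compare the result with one of the `validate_*` helpers; `return TEST_UNIMPL` simply keeps the case reported as unimplemented until the corresponding `sse2rvv` intrinsic is ready. As a minimal sketch, assuming the `load_m128`/`validate_float` helpers behave the way the commented bodies suggest, enabling `test_mm_load_ps` would look like this:

```cpp
result_t test_mm_load_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
  const float *addr = impl.test_cases_float_pointer1;

  // Run the intrinsic under test on the shared input buffer ...
  __m128 ret = _mm_load_ps(addr);

  // ... and check every lane against the scalar reference values.
  return validate_float(ret, addr[0], addr[1], addr[2], addr[3]);
}
```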
+result_t test_mm_loadu_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_loadu_ps(addr); + // + // return validate_float(ret, addr[0], addr[1], addr[2], addr[3]); + return TEST_UNIMPL; +} + +result_t test_mm_loadu_si16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // The GCC version before 11 does not implement intrinsic function + // _mm_loadu_si16. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) + // return TEST_UNIMPL; + // #else + // const int16_t *addr = (const int16_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_loadu_si16((const void *)addr); + // + // return validate_int16(ret, addr[0], 0, 0, 0, 0, 0, 0, 0); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_loadu_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // Versions of GCC prior to 9 do not implement intrinsic function + // _mm_loadu_si64. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9) + // return TEST_UNIMPL; + // #else + // const int64_t *addr = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_loadu_si64((const void *)addr); + // + // return validate_int64(ret, addr[0], 0); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_malloc(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const size_t *a = (const size_t *)impl.test_cases_int_pointer1; + // const size_t *b = (const size_t *)impl.test_cases_int_pointer2; + // size_t size = *a % (1024 * 16) + 1; + // size_t align = 2 << (*b % 5); + // + // void *p = _mm_malloc(size, align); + // if (!p) + // return TEST_FAIL; + // result_t res = (((uintptr_t)p % align) == 0) ? TEST_SUCCESS : TEST_FAIL; + // _mm_free(p); + // return res; + return TEST_UNIMPL; +} + +result_t test_mm_maskmove_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_mask = (const uint8_t *)impl.test_cases_int_pointer2; + // char mem_addr[16]; + // + // const __m64 *a = (const __m64 *)_a; + // const __m64 *mask = (const __m64 *)_mask; + // _mm_maskmove_si64(*a, *mask, (char *)mem_addr); + // + // for (int i = 0; i < 8; i++) { + // if (_mask[i] >> 7) { + // ASSERT_RETURN(_a[i] == (uint8_t)mem_addr[i]); + // } + // } + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_m_maskmovq(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_maskmove_si64(impl, iter); +} + +result_t test_mm_max_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t c[4]; + // + // c[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_max_pi16(a, b); + // return VALIDATE_INT16_M64(ret, c); + return TEST_UNIMPL; +} + +result_t test_mm_max_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float c[4]; + // + // c[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] > _b[1] ? 
_a[1] : _b[1]; + // c[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_max_ps(a, b); + // return validate_float(ret, c[0], c[1], c[2], c[3]); + return TEST_UNIMPL; +} + +result_t test_mm_max_pu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint8_t c[8]; + // + // c[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // c[4] = _a[4] > _b[4] ? _a[4] : _b[4]; + // c[5] = _a[5] > _b[5] ? _a[5] : _b[5]; + // c[6] = _a[6] > _b[6] ? _a[6] : _b[6]; + // c[7] = _a[7] > _b[7] ? _a[7] : _b[7]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_max_pu8(a, b); + // return VALIDATE_UINT8_M64(ret, c); + return TEST_UNIMPL; +} + +result_t test_mm_max_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer1; + // + // float f0 = _a[0] > _b[0] ? _a[0] : _b[0]; + // float f1 = _a[1]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = _mm_load_ps(_a); + // __m128 b = _mm_load_ps(_b); + // __m128 c = _mm_max_ss(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_min_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t c[4]; + // + // c[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_min_pi16(a, b); + // return VALIDATE_INT16_M64(ret, c); + return TEST_UNIMPL; +} + +result_t test_mm_min_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float c[4]; + // + // c[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_min_ps(a, b); + // return validate_float(ret, c[0], c[1], c[2], c[3]); + return TEST_UNIMPL; +} + +result_t test_mm_min_pu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint8_t c[8]; + // + // c[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // c[4] = _a[4] < _b[4] ? _a[4] : _b[4]; + // c[5] = _a[5] < _b[5] ? _a[5] : _b[5]; + // c[6] = _a[6] < _b[6] ? _a[6] : _b[6]; + // c[7] = _a[7] < _b[7] ? 
_a[7] : _b[7]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_min_pu8(a, b); + // return VALIDATE_UINT8_M64(ret, c); + return TEST_UNIMPL; +} + +result_t test_mm_min_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float c; + // + // c = _a[0] < _b[0] ? _a[0] : _b[0]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_min_ss(a, b); + // + // return validate_float(ret, c, _a[1], _a[2], _a[3]); + return TEST_UNIMPL; +} + +result_t test_mm_move_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _b[0]; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_move_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_movehl_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _b[2]; + // float f1 = _b[3]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_movehl_ps(a, b); + // + // return validate_float(ret, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_movelh_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _a[0]; + // float f1 = _a[1]; + // float f2 = _b[0]; + // float f3 = _b[1]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_movelh_ps(a, b); + // + // return validate_float(ret, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_movemask_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // unsigned int _c = 0; + // for (int i = 0; i < 8; i++) { + // if (_a[i] & 0x80) { + // _c |= (1 << i); + // } + // } + // + // const __m64 *a = (const __m64 *)_a; + // int c = _mm_movemask_pi8(*a); + // + // ASSERT_RETURN((unsigned int)c == _c); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_movemask_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // int ret = 0; + // + // const uint32_t *ip = (const uint32_t *)p; + // if (ip[0] & 0x80000000) { + // ret |= 1; + // } + // if (ip[1] & 0x80000000) { + // ret |= 2; + // } + // if (ip[2] & 0x80000000) { + // ret |= 4; + // } + // if (ip[3] & 0x80000000) { + // ret |= 8; + // } + // __m128 a = load_m128(p); + // int val = _mm_movemask_ps(a); + // return val == ret ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_mul_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float dx = _a[0] * _b[0]; + // float dy = _a[1] * _b[1]; + // float dz = _a[2] * _b[2]; + // float dw = _a[3] * _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_mul_ps(a, b); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_mul_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float dx = _a[0] * _b[0]; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_mul_ss(a, b); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_mulhi_pu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // uint16_t d[4]; + // for (uint32_t i = 0; i < 4; i++) { + // uint32_t m = (uint32_t)_a[i] * (uint32_t)_b[i]; + // d[i] = (uint16_t)(m >> 16); + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_mulhi_pu16(a, b); + // return VALIDATE_UINT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_or_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_or_ps(a, b); + // now for the assertion... 
+ // const uint32_t *ia = (const uint32_t *)&a; + // const uint32_t *ib = (const uint32_t *)&b; + // uint32_t r[4]; + // r[0] = ia[0] | ib[0]; + // r[1] = ia[1] | ib[1]; + // r[2] = ia[2] | ib[2]; + // r[3] = ia[3] | ib[3]; + // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + // result_t res = VALIDATE_INT32_M128(*(const __m128i *)&c, r); + // if (res) { + // res = VALIDATE_INT32_M128(ret, r); + // } + // + // return res; + return TEST_UNIMPL; +} + +result_t test_m_pavgb(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_avg_pu8(impl, iter); +} + +result_t test_m_pavgw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_avg_pu16(impl, iter); +} + +result_t test_m_pextrw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_extract_pi16(impl, iter); +} + +result_t test_m_pinsrw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_insert_pi16(impl, iter); +} + +result_t test_m_pmaxsw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_max_pi16(impl, iter); +} + +result_t test_m_pmaxub(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_max_pu8(impl, iter); +} + +result_t test_m_pminsw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_min_pi16(impl, iter); +} + +result_t test_m_pminub(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_min_pu8(impl, iter); +} + +result_t test_m_pmovmskb(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_movemask_pi8(impl, iter); +} + +result_t test_m_pmulhuw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_mulhi_pu16(impl, iter); +} + +result_t test_mm_prefetch(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // typedef struct { + // __m128 a; + // float r[4]; + // } prefetch_test_t; + // prefetch_test_t test_vec[8] = { + // { + // _mm_set_ps(-0.1f, 0.2f, 0.3f, 0.4f), + // {0.4f, 0.3f, 0.2f, -0.1f}, + // }, + // { + // _mm_set_ps(0.5f, 0.6f, -0.7f, -0.8f), + // {-0.8f, -0.7f, 0.6f, 0.5f}, + // }, + // { + // _mm_set_ps(0.9f, 0.10f, -0.11f, 0.12f), + // {0.12f, -0.11f, 0.10f, 0.9f}, + // }, + // { + // _mm_set_ps(-1.1f, -2.1f, -3.1f, -4.1f), + // {-4.1f, -3.1f, -2.1f, -1.1f}, + // }, + // { + // _mm_set_ps(100.0f, -110.0f, 120.0f, -130.0f), + // {-130.0f, 120.0f, -110.0f, 100.0f}, + // }, + // { + // _mm_set_ps(200.5f, 210.5f, -220.5f, 230.5f), + // {995.74f, -93.04f, 144.03f, 902.50f}, + // }, + // { + // _mm_set_ps(10.11f, -11.12f, -12.13f, 13.14f), + // {13.14f, -12.13f, -11.12f, 10.11f}, + // }, + // { + // _mm_set_ps(10.1f, -20.2f, 30.3f, 40.4f), + // {40.4f, 30.3f, -20.2f, 10.1f}, + // }, + // }; + // + // for (size_t i = 0; i < (sizeof(test_vec) / (sizeof(test_vec[0]))); i++) { + // _mm_prefetch(((const char *)&test_vec[i].a), _MM_HINT_T0); + // _mm_prefetch(((const char *)&test_vec[i].a), _MM_HINT_T1); + // _mm_prefetch(((const char *)&test_vec[i].a), _MM_HINT_T2); + // _mm_prefetch(((const char *)&test_vec[i].a), _MM_HINT_NTA); + // } + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_m_psadbw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint16_t d = 0; + // for (int i = 0; i < 8; i++) { + // d += abs(_a[i] - _b[i]); + // } + + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _m_psadbw(a, b); + // return validate_uint16(c, d, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_m_pshufw(const SSE2RVV_TEST_IMPL &impl, 
uint32_t iter) { + // return test_mm_shuffle_pi16(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_rcp_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // float dx = 1.0f / _a[0]; + // float dy = 1.0f / _a[1]; + // float dz = 1.0f / _a[2]; + // float dw = 1.0f / _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_rcp_ps(a); + // return validate_float_error(c, dx, dy, dz, dw, 0.001f); + return TEST_UNIMPL; +} + +result_t test_mm_rcp_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // float dx = 1.0f / _a[0]; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // __m128 a = load_m128(_a); + // __m128 c = _mm_rcp_ss(a); + // return validate_float_error(c, dx, dy, dz, dw, 0.001f); + return TEST_UNIMPL; +} + +result_t test_mm_rsqrt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // float f0 = 1 / sqrt(_a[0]); + // float f1 = 1 / sqrt(_a[1]); + // float f2 = 1 / sqrt(_a[2]); + // float f3 = 1 / sqrt(_a[3]); + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_rsqrt_ps(a); + // + // Here, we ensure the error rate of "_mm_rsqrt_ps()" is under 0.1% compared + // to the C implementation. + // return validate_float_error(c, f0, f1, f2, f3, 0.001f); + return TEST_UNIMPL; +} + +result_t test_mm_rsqrt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // float f0 = 1 / sqrt(_a[0]); + // float f1 = _a[1]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_rsqrt_ss(a); + // + // Here, we ensure the error rate of "_mm_rsqrt_ps()" is under 0.1% compared + // to the C implementation. + // return validate_float_error(c, f0, f1, f2, f3, 0.001f); + return TEST_UNIMPL; +} + +result_t test_mm_sad_pu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint16_t d = 0; + // for (int i = 0; i < 8; i++) { + // d += abs(_a[i] - _b[i]); + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sad_pu8(a, b); + // return validate_uint16(c, d, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_set_flush_zero_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // TODO: + // After the behavior of denormal number and flush zero mode is fully + // investigated, the testing would be added. 
+ // return TEST_UNIMPL; + return TEST_UNIMPL; +} + +result_t test_mm_set_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float x = impl.test_cases_floats[iter]; + // float y = impl.test_cases_floats[iter + 1]; + // float z = impl.test_cases_floats[iter + 2]; + // float w = impl.test_cases_floats[iter + 3]; + // __m128 a = _mm_set_ps(x, y, z, w); + // return validate_float(a, w, z, y, x); + return TEST_UNIMPL; +} + +result_t test_mm_set_ps1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float a = impl.test_cases_floats[iter]; + // + // __m128 ret = _mm_set_ps1(a); + // + // return validate_float(ret, a, a, a, a); + return TEST_UNIMPL; +} + +result_t test_mm_set_rounding_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // result_t res_toward_zero, res_to_neg_inf, res_to_pos_inf, res_nearest; + // + // __m128 a = load_m128(_a); + // __m128 b, c; + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // b = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // c = _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // res_toward_zero = validate_128bits(c, b); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // b = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // c = _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // res_to_neg_inf = validate_128bits(c, b); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // b = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // c = _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // res_to_pos_inf = validate_128bits(c, b); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // b = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // c = _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + // res_nearest = validate_128bits(c, b); + // + // if (res_toward_zero == TEST_SUCCESS && res_to_neg_inf == TEST_SUCCESS && + // res_to_pos_inf == TEST_SUCCESS && res_nearest == TEST_SUCCESS) { + // return TEST_SUCCESS; + // } else { + // return TEST_FAIL; + // } + return TEST_UNIMPL; +} + +result_t test_mm_set_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float a = impl.test_cases_floats[iter]; + // __m128 c = _mm_set_ss(a); + // return validate_float(c, a, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_set1_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float w = impl.test_cases_floats[iter]; + // __m128 a = _mm_set1_ps(w); + // return validate_float(a, w, w, w, w); + return TEST_UNIMPL; +} + +result_t test_mm_setcsr(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_set_rounding_mode(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_setr_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float x = impl.test_cases_floats[iter]; + // float y = impl.test_cases_floats[iter + 1]; + // float z = impl.test_cases_floats[iter + 2]; + // float w = impl.test_cases_floats[iter + 3]; + // + // __m128 ret = _mm_setr_ps(w, z, y, x); + // + // return validate_float(ret, w, z, y, x); + return TEST_UNIMPL; +} + +result_t test_mm_setzero_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128 a = _mm_setzero_ps(); + // return validate_float(a, 0, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_sfence(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* FIXME: Assume that memory barriers always function as intended. 
*/ + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m64 a; + // __m64 d; + // + // #define TEST_IMPL(IDX) + // a = load_m64(_a); + // d = _mm_shuffle_pi16(a, IDX); + // + // int16_t _d##IDX[4]; + // _d##IDX[0] = _a[IDX & 0x3]; + // _d##IDX[1] = _a[(IDX >> 2) & 0x3]; + // _d##IDX[2] = _a[(IDX >> 4) & 0x3]; + // _d##IDX[3] = _a[(IDX >> 6) & 0x3]; + // if (VALIDATE_INT16_M64(d, _d##IDX) != TEST_SUCCESS) { + // return TEST_FAIL; + // } + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +// Note, NEON does not have a general purpose shuffled command like SSE. +// When invoking this method, there is special code for a number of the most +// common shuffle permutations +result_t test_mm_shuffle_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // result_t isValid = TEST_SUCCESS; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // Test many permutations of the shuffle operation, including all + // permutations which have an optimized/customized implementation + // __m128 ret; + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(0, 1, 2, 3)); + // if (!validate_float(ret, _a[3], _a[2], _b[1], _b[0])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)); + // if (!validate_float(ret, _a[0], _a[1], _b[2], _b[3])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(0, 0, 1, 1)); + // if (!validate_float(ret, _a[1], _a[1], _b[0], _b[0])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 0, 2)); + // if (!validate_float(ret, _a[2], _a[0], _b[1], _b[3])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)); + // if (!validate_float(ret, _a[2], _a[3], _b[0], _b[1])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 3, 0, 1)); + // if (!validate_float(ret, _a[1], _a[0], _b[3], _b[2])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(0, 0, 2, 2)); + // if (!validate_float(ret, _a[2], _a[2], _b[0], _b[0])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 2, 0, 0)); + // if (!validate_float(ret, _a[0], _a[0], _b[2], _b[2])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 0, 2)); + // if (!validate_float(ret, _a[2], _a[0], _b[2], _b[3])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 1, 3, 3)); + // if (!validate_float(ret, _a[3], _a[3], _b[1], _b[1])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 1, 0)); + // if (!validate_float(ret, _a[0], _a[1], _b[0], _b[2])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 0, 1)); + // if (!validate_float(ret, _a[1], _a[0], _b[0], _b[2])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 3, 2)); + // if (!validate_float(ret, _a[2], _a[3], _b[0], _b[2])) { + // isValid = TEST_FAIL; + // } + // + // return isValid; + return TEST_UNIMPL; +} + +result_t test_mm_sqrt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // float f0 = sqrt(_a[0]); + // float f1 = 
sqrt(_a[1]); + // float f2 = sqrt(_a[2]); + // float f3 = sqrt(_a[3]); + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_sqrt_ps(a); + // + // return validate_float_error(c, f0, f1, f2, f3, 0.000001f); + return TEST_UNIMPL; +} + +result_t test_mm_sqrt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // float f0 = sqrt(_a[0]); + // float f1 = _a[1]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_sqrt_ss(a); + // + // return validate_float_error(c, f0, f1, f2, f3, 0.000001f); + return TEST_UNIMPL; +} + +result_t test_mm_store_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t *p = impl.test_cases_int_pointer1; + // int32_t x = impl.test_cases_ints[iter]; + // int32_t y = impl.test_cases_ints[iter + 1]; + // int32_t z = impl.test_cases_ints[iter + 2]; + // int32_t w = impl.test_cases_ints[iter + 3]; + // __m128i a = _mm_set_epi32(x, y, z, w); + // _mm_store_ps((float *)p, *(const __m128 *)&a); + // ASSERT_RETURN(p[0] == w); + // ASSERT_RETURN(p[1] == z); + // ASSERT_RETURN(p[2] == y); + // ASSERT_RETURN(p[3] == x); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_ps1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *p = impl.test_cases_float_pointer1; + // float d[4]; + // + // __m128 a = load_m128(p); + // _mm_store_ps1(d, a); + // + // ASSERT_RETURN(d[0] == *p); + // ASSERT_RETURN(d[1] == *p); + // ASSERT_RETURN(d[2] == *p); + // ASSERT_RETURN(d[3] == *p); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float x = impl.test_cases_floats[iter]; + // float p[4]; + // + // __m128 a = _mm_set_ss(x); + // _mm_store_ss(p, a); + // ASSERT_RETURN(p[0] == x); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store1_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *p = impl.test_cases_float_pointer1; + // float d[4]; + // + // __m128 a = load_m128(p); + // _mm_store1_ps(d, a); + // + // ASSERT_RETURN(d[0] == *p); + // ASSERT_RETURN(d[1] == *p); + // ASSERT_RETURN(d[2] == *p); + // ASSERT_RETURN(d[3] == *p); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storeh_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // float d[4] = {1.0f, 2.0f, 3.0f, 4.0f}; + // __m128 a = _mm_load_ps(p); + // __m64 *b = (__m64 *)d; + // + // _mm_storeh_pi(b, a); + // ASSERT_RETURN(d[0] == p[2]); + // ASSERT_RETURN(d[1] == p[3]); + // ASSERT_RETURN(d[2] == 3.0f); + // ASSERT_RETURN(d[3] == 4.0f); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storel_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // float d[4] = {1.0f, 2.0f, 3.0f, 4.0f}; + // __m128 a = _mm_load_ps(p); + // __m64 *b = (__m64 *)d; + // + // _mm_storel_pi(b, a); + // ASSERT_RETURN(d[0] == p[0]); + // ASSERT_RETURN(d[1] == p[1]); + // ASSERT_RETURN(d[2] == 3.0f); + // ASSERT_RETURN(d[3] == 4.0f); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storer_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *p = impl.test_cases_float_pointer1; + // float d[4]; + // + // __m128 a = load_m128(p); + // _mm_storer_ps(d, a); + // + // ASSERT_RETURN(d[0] == p[3]); + // ASSERT_RETURN(d[1] == p[2]); + // ASSERT_RETURN(d[2] == p[1]); + // ASSERT_RETURN(d[3] == p[0]); + // return 
TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storeu_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *_a = impl.test_cases_float_pointer1; + // float f[4]; + // __m128 a = _mm_load_ps(_a); + // + // _mm_storeu_ps(f, a); + // return validate_float(a, f[0], f[1], f[2], f[3]); + return TEST_UNIMPL; +} + +result_t test_mm_storeu_si16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // The GCC version before 11 does not implement intrinsic function + // _mm_storeu_si16. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) + // return TEST_UNIMPL; + // #else + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i b; + // __m128i a = load_m128i(_a); + // _mm_storeu_si16(&b, a); + // int16_t *_b = (int16_t *)&b; + // int16_t *_c = (int16_t *)&a; + // return validate_int16(b, _c[0], _b[1], _b[2], _b[3], _b[4], _b[5], _b[6], + // _b[7]); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_storeu_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // Versions of GCC prior to 9 do not implement intrinsic function + // _mm_storeu_si64. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87558 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9) + // return TEST_UNIMPL; + // #else + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i b; + // __m128i a = load_m128i(_a); + // _mm_storeu_si64(&b, a); + // int64_t *_b = (int64_t *)&b; + // int64_t *_c = (int64_t *)&a; + // return validate_int64(b, _c[0], _b[1]); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_stream_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // __m64 p; + // + // _mm_stream_pi(&p, a); + // return validate_int64(p, _a[0]); + return TEST_UNIMPL; +} + +result_t test_mm_stream_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // __m128 a = load_m128(_a); + // alignas(16) float p[4]; + // + // _mm_stream_ps(p, a); + // ASSERT_RETURN(p[0] == _a[0]); + // ASSERT_RETURN(p[1] == _a[1]); + // ASSERT_RETURN(p[2] == _a[2]); + // ASSERT_RETURN(p[3] == _a[3]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sub_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float dx = _a[0] - _b[0]; + // float dy = _a[1] - _b[1]; + // float dz = _a[2] - _b[2]; + // float dw = _a[3] - _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_sub_ps(a, b); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_sub_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float dx = _a[0] - _b[0]; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_sub_ss(a, b); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_ucomieq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomieq_ss is equal to _mm_comieq_ss + // return test_mm_comieq_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t 
test_mm_ucomige_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomige_ss is equal to _mm_comige_ss + // return test_mm_comige_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomigt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomigt_ss is equal to _mm_comigt_ss + // return test_mm_comigt_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomile_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomile_ss is equal to _mm_comile_ss + // return test_mm_comile_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomilt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomilt_ss is equal to _mm_comilt_ss + // return test_mm_comilt_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomineq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomineq_ss is equal to _mm_comineq_ss + // return test_mm_comineq_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_undefined_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128 a = _mm_undefined_ps(); + // a = _mm_xor_ps(a, a); + // return validate_float(a, 0, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *_a = impl.test_cases_float_pointer1; + // float *_b = impl.test_cases_float_pointer1; + // + // float f0 = _a[2]; + // float f1 = _b[2]; + // float f2 = _a[3]; + // float f3 = _b[3]; + // + // __m128 a = _mm_load_ps(_a); + // __m128 b = _mm_load_ps(_b); + // __m128 c = _mm_unpackhi_ps(a, b); + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *_a = impl.test_cases_float_pointer1; + // float *_b = impl.test_cases_float_pointer1; + // + // float f0 = _a[0]; + // float f1 = _b[0]; + // float f2 = _a[1]; + // float f3 = _b[1]; + // + // __m128 a = _mm_load_ps(_a); + // __m128 b = _mm_load_ps(_b); + // __m128 c = _mm_unpacklo_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_xor_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_float_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_float_pointer2; + // + // int32_t d0 = _a[0] ^ _b[0]; + // int32_t d1 = _a[1] ^ _b[1]; + // int32_t d2 = _a[2] ^ _b[2]; + // int32_t d3 = _a[3] ^ _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_xor_ps(a, b); + // + // return validate_float(c, *((float *)&d0), *((float *)&d1), *((float + // *)&d2), + // *((float *)&d3)); + return TEST_UNIMPL; +} + +/* SSE2 */ +result_t test_mm_add_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[8]; + // d[0] = _a[0] + _b[0]; + // d[1] = _a[1] + _b[1]; + // d[2] = _a[2] + _b[2]; + // d[3] = _a[3] + _b[3]; + // d[4] = _a[4] + _b[4]; + // d[5] = _a[5] + _b[5]; + // d[6] = _a[6] + _b[6]; + // d[7] = _a[7] + _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_add_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_add_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // int32_t d[4]; + // 
d[0] = _a[0] + _b[0]; + // d[1] = _a[1] + _b[1]; + // d[2] = _a[2] + _b[2]; + // d[3] = _a[3] + _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_add_epi32(a, b); + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_add_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t d0 = _a[0] + _b[0]; + // int64_t d1 = _a[1] + _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_add_epi64(a, b); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_add_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = _a[0] + _b[0]; + // d[1] = _a[1] + _b[1]; + // d[2] = _a[2] + _b[2]; + // d[3] = _a[3] + _b[3]; + // d[4] = _a[4] + _b[4]; + // d[5] = _a[5] + _b[5]; + // d[6] = _a[6] + _b[6]; + // d[7] = _a[7] + _b[7]; + // d[8] = _a[8] + _b[8]; + // d[9] = _a[9] + _b[9]; + // d[10] = _a[10] + _b[10]; + // d[11] = _a[11] + _b[11]; + // d[12] = _a[12] + _b[12]; + // d[13] = _a[13] + _b[13]; + // d[14] = _a[14] + _b[14]; + // d[15] = _a[15] + _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_add_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_add_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] + _b[0]; + // double d1 = _a[1] + _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_add_pd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_add_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] + _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_add_sd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_add_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t d0 = _a[0] + _b[0]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_add_si64(a, b); + // + // return validate_int64(c, d0); + return TEST_UNIMPL; +} + +result_t test_mm_adds_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int32_t d[8]; + // d[0] = (int32_t)_a[0] + (int32_t)_b[0]; + // if (d[0] > 32767) + // d[0] = 32767; + // if (d[0] < -32768) + // d[0] = -32768; + // d[1] = (int32_t)_a[1] + (int32_t)_b[1]; + // if (d[1] > 32767) + // d[1] = 32767; + // if (d[1] < -32768) + // d[1] = -32768; + // d[2] = (int32_t)_a[2] + (int32_t)_b[2]; + // if (d[2] > 32767) + // d[2] = 32767; + // if (d[2] < -32768) + // d[2] = 
-32768; + // d[3] = (int32_t)_a[3] + (int32_t)_b[3]; + // if (d[3] > 32767) + // d[3] = 32767; + // if (d[3] < -32768) + // d[3] = -32768; + // d[4] = (int32_t)_a[4] + (int32_t)_b[4]; + // if (d[4] > 32767) + // d[4] = 32767; + // if (d[4] < -32768) + // d[4] = -32768; + // d[5] = (int32_t)_a[5] + (int32_t)_b[5]; + // if (d[5] > 32767) + // d[5] = 32767; + // if (d[5] < -32768) + // d[5] = -32768; + // d[6] = (int32_t)_a[6] + (int32_t)_b[6]; + // if (d[6] > 32767) + // d[6] = 32767; + // if (d[6] < -32768) + // d[6] = -32768; + // d[7] = (int32_t)_a[7] + (int32_t)_b[7]; + // if (d[7] > 32767) + // d[7] = 32767; + // if (d[7] < -32768) + // d[7] = -32768; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_adds_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_adds_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int16_t d[16]; + // for (int i = 0; i < 16; i++) { + // d[i] = (int16_t)_a[i] + (int16_t)_b[i]; + // if (d[i] > 127) + // d[i] = 127; + // if (d[i] < -128) + // d[i] = -128; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_adds_epi8(a, b); + // + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_adds_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint32_t max = 0xFFFF; + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // + // uint16_t d[8]; + // d[0] = (uint32_t)_a[0] + (uint32_t)_b[0] > max ? max : _a[0] + _b[0]; + // d[1] = (uint32_t)_a[1] + (uint32_t)_b[1] > max ? max : _a[1] + _b[1]; + // d[2] = (uint32_t)_a[2] + (uint32_t)_b[2] > max ? max : _a[2] + _b[2]; + // d[3] = (uint32_t)_a[3] + (uint32_t)_b[3] > max ? max : _a[3] + _b[3]; + // d[4] = (uint32_t)_a[4] + (uint32_t)_b[4] > max ? max : _a[4] + _b[4]; + // d[5] = (uint32_t)_a[5] + (uint32_t)_b[5] > max ? max : _a[5] + _b[5]; + // d[6] = (uint32_t)_a[6] + (uint32_t)_b[6] > max ? max : _a[6] + _b[6]; + // d[7] = (uint32_t)_a[7] + (uint32_t)_b[7] > max ? 
max : _a[7] + _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_adds_epu16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_adds_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = (uint8_t)_a[0] + (uint8_t)_b[0]; + // if (d[0] < (uint8_t)_a[0]) + // d[0] = 255; + // d[1] = (uint8_t)_a[1] + (uint8_t)_b[1]; + // if (d[1] < (uint8_t)_a[1]) + // d[1] = 255; + // d[2] = (uint8_t)_a[2] + (uint8_t)_b[2]; + // if (d[2] < (uint8_t)_a[2]) + // d[2] = 255; + // d[3] = (uint8_t)_a[3] + (uint8_t)_b[3]; + // if (d[3] < (uint8_t)_a[3]) + // d[3] = 255; + // d[4] = (uint8_t)_a[4] + (uint8_t)_b[4]; + // if (d[4] < (uint8_t)_a[4]) + // d[4] = 255; + // d[5] = (uint8_t)_a[5] + (uint8_t)_b[5]; + // if (d[5] < (uint8_t)_a[5]) + // d[5] = 255; + // d[6] = (uint8_t)_a[6] + (uint8_t)_b[6]; + // if (d[6] < (uint8_t)_a[6]) + // d[6] = 255; + // d[7] = (uint8_t)_a[7] + (uint8_t)_b[7]; + // if (d[7] < (uint8_t)_a[7]) + // d[7] = 255; + // d[8] = (uint8_t)_a[8] + (uint8_t)_b[8]; + // if (d[8] < (uint8_t)_a[8]) + // d[8] = 255; + // d[9] = (uint8_t)_a[9] + (uint8_t)_b[9]; + // if (d[9] < (uint8_t)_a[9]) + // d[9] = 255; + // d[10] = (uint8_t)_a[10] + (uint8_t)_b[10]; + // if (d[10] < (uint8_t)_a[10]) + // d[10] = 255; + // d[11] = (uint8_t)_a[11] + (uint8_t)_b[11]; + // if (d[11] < (uint8_t)_a[11]) + // d[11] = 255; + // d[12] = (uint8_t)_a[12] + (uint8_t)_b[12]; + // if (d[12] < (uint8_t)_a[12]) + // d[12] = 255; + // d[13] = (uint8_t)_a[13] + (uint8_t)_b[13]; + // if (d[13] < (uint8_t)_a[13]) + // d[13] = 255; + // d[14] = (uint8_t)_a[14] + (uint8_t)_b[14]; + // if (d[14] < (uint8_t)_a[14]) + // d[14] = 255; + // d[15] = (uint8_t)_a[15] + (uint8_t)_b[15]; + // if (d[15] < (uint8_t)_a[15]) + // d[15] = 255; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_adds_epu8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_and_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2; + // + // int64_t d0 = _a[0] & _b[0]; + // int64_t d1 = _a[1] & _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_and_pd(a, b); + // + // return validate_double(c, *((double *)&d0), *((double *)&d1)); + return TEST_UNIMPL; +} + +result_t test_mm_and_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128 fc = _mm_and_ps(*(const __m128 *)&a, *(const __m128 *)&b); + // __m128i c = *(const __m128i *)&fc; + // now for the assertion... 
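+ // The reference result is computed lane by lane on the uint32_t views of a
+ // and b; both the _mm_and_ps output (reinterpreted back to __m128i) and the
+ // reconstructed do_mm_set_epi32 value are compared against it bit-exactly.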
+ // const uint32_t *ia = (const uint32_t *)&a; + // const uint32_t *ib = (const uint32_t *)&b; + // uint32_t r[4]; + // r[0] = ia[0] & ib[0]; + // r[1] = ia[1] & ib[1]; + // r[2] = ia[2] & ib[2]; + // r[3] = ia[3] & ib[3]; + // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + // result_t res = VALIDATE_INT32_M128(c, r); + // if (res) { + // res = VALIDATE_INT32_M128(ret, r); + // } + // return res; + return TEST_UNIMPL; +} + +result_t test_mm_andnot_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_andnot_pd(a, b); + // + // Take AND operation a complement of 'a' and 'b'. Bitwise operations are + // not allowed on float/double datatype, so 'a' and 'b' are calculated in + // uint64_t datatype. + // const uint64_t *ia = (const uint64_t *)&a; + // const uint64_t *ib = (const uint64_t *)&b; + // uint64_t r0 = ~ia[0] & ib[0]; + // uint64_t r1 = ~ia[1] & ib[1]; + // return validate_uint64(*(const __m128i *)&c, r0, r1); + return TEST_UNIMPL; +} + +result_t test_mm_andnot_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128 fc = _mm_andnot_ps(*(const __m128 *)&a, *(const __m128 *)&b); + // __m128i c = *(const __m128i *)&fc; + // now for the assertion... + // const uint32_t *ia = (const uint32_t *)&a; + // const uint32_t *ib = (const uint32_t *)&b; + // uint32_t r[4]; + // r[0] = ~ia[0] & ib[0]; + // r[1] = ~ia[1] & ib[1]; + // r[2] = ~ia[2] & ib[2]; + // r[3] = ~ia[3] & ib[3]; + // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + // result_t res = TEST_SUCCESS; + // res = VALIDATE_INT32_M128(c, r); + // if (res) { + // res = VALIDATE_INT32_M128(ret, r); + // } + // return res; + return TEST_UNIMPL; +} + +result_t test_mm_avg_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // d[0] = ((uint16_t)_a[0] + (uint16_t)_b[0] + 1) >> 1; + // d[1] = ((uint16_t)_a[1] + (uint16_t)_b[1] + 1) >> 1; + // d[2] = ((uint16_t)_a[2] + (uint16_t)_b[2] + 1) >> 1; + // d[3] = ((uint16_t)_a[3] + (uint16_t)_b[3] + 1) >> 1; + // d[4] = ((uint16_t)_a[4] + (uint16_t)_b[4] + 1) >> 1; + // d[5] = ((uint16_t)_a[5] + (uint16_t)_b[5] + 1) >> 1; + // d[6] = ((uint16_t)_a[6] + (uint16_t)_b[6] + 1) >> 1; + // d[7] = ((uint16_t)_a[7] + (uint16_t)_b[7] + 1) >> 1; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_avg_epu16(a, b); + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_avg_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = ((uint8_t)_a[0] + (uint8_t)_b[0] + 1) >> 1; + // d[1] = ((uint8_t)_a[1] + (uint8_t)_b[1] + 1) >> 1; + // d[2] = ((uint8_t)_a[2] + (uint8_t)_b[2] + 1) >> 1; + // d[3] = ((uint8_t)_a[3] + (uint8_t)_b[3] + 1) >> 1; + // d[4] = ((uint8_t)_a[4] + (uint8_t)_b[4] + 1) >> 1; + // d[5] = ((uint8_t)_a[5] + (uint8_t)_b[5] + 1) >> 1; + // d[6] = ((uint8_t)_a[6] + 
(uint8_t)_b[6] + 1) >> 1; + // d[7] = ((uint8_t)_a[7] + (uint8_t)_b[7] + 1) >> 1; + // d[8] = ((uint8_t)_a[8] + (uint8_t)_b[8] + 1) >> 1; + // d[9] = ((uint8_t)_a[9] + (uint8_t)_b[9] + 1) >> 1; + // d[10] = ((uint8_t)_a[10] + (uint8_t)_b[10] + 1) >> 1; + // d[11] = ((uint8_t)_a[11] + (uint8_t)_b[11] + 1) >> 1; + // d[12] = ((uint8_t)_a[12] + (uint8_t)_b[12] + 1) >> 1; + // d[13] = ((uint8_t)_a[13] + (uint8_t)_b[13] + 1) >> 1; + // d[14] = ((uint8_t)_a[14] + (uint8_t)_b[14] + 1) >> 1; + // d[15] = ((uint8_t)_a[15] + (uint8_t)_b[15] + 1) >> 1; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_avg_epu8(a, b); + // return VALIDATE_UINT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_bslli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_slli_si128(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_bsrli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_srli_si128(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_castpd_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const __m128d a = load_m128d(_a); + // const __m128 _c = load_m128(_a); + // + // __m128 r = _mm_castpd_ps(a); + // + // return validate_128bits(r, _c); + return TEST_UNIMPL; +} + +result_t test_mm_castpd_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const __m128d a = load_m128d(_a); + // const __m128i *_c = (const __m128i *)_a; + // + // __m128i r = _mm_castpd_si128(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_castps_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const __m128 a = load_m128(_a); + // const __m128d *_c = (const __m128d *)_a; + // + // __m128d r = _mm_castps_pd(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_castps_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // const __m128i *_c = (const __m128i *)_a; + // + // const __m128 a = load_m128(_a); + // __m128i r = _mm_castps_si128(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_castsi128_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // + // const __m128d *_c = (const __m128d *)_a; + // + // const __m128i a = load_m128i(_a); + // __m128d r = _mm_castsi128_pd(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_castsi128_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // + // const __m128 *_c = (const __m128 *)_a; + // + // const __m128i a = load_m128i(_a); + // __m128 r = _mm_castsi128_ps(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_clflush(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* FIXME: Assume that we have portable mechanisms to flush cache. */ + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = (_a[0] == _b[0]) ? ~UINT16_C(0) : 0x0; + // d[1] = (_a[1] == _b[1]) ? 
~UINT16_C(0) : 0x0; + // d[2] = (_a[2] == _b[2]) ? ~UINT16_C(0) : 0x0; + // d[3] = (_a[3] == _b[3]) ? ~UINT16_C(0) : 0x0; + // d[4] = (_a[4] == _b[4]) ? ~UINT16_C(0) : 0x0; + // d[5] = (_a[5] == _b[5]) ? ~UINT16_C(0) : 0x0; + // d[6] = (_a[6] == _b[6]) ? ~UINT16_C(0) : 0x0; + // d[7] = (_a[7] == _b[7]) ? ~UINT16_C(0) : 0x0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpeq_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = (_a[0] == _b[0]) ? ~UINT32_C(0) : 0x0; + // d[1] = (_a[1] == _b[1]) ? ~UINT32_C(0) : 0x0; + // d[2] = (_a[2] == _b[2]) ? ~UINT32_C(0) : 0x0; + // d[3] = (_a[3] == _b[3]) ? ~UINT32_C(0) : 0x0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpeq_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = (_a[0] == _b[0]) ? ~UINT8_C(0) : 0x00; + // d[1] = (_a[1] == _b[1]) ? ~UINT8_C(0) : 0x00; + // d[2] = (_a[2] == _b[2]) ? ~UINT8_C(0) : 0x00; + // d[3] = (_a[3] == _b[3]) ? ~UINT8_C(0) : 0x00; + // d[4] = (_a[4] == _b[4]) ? ~UINT8_C(0) : 0x00; + // d[5] = (_a[5] == _b[5]) ? ~UINT8_C(0) : 0x00; + // d[6] = (_a[6] == _b[6]) ? ~UINT8_C(0) : 0x00; + // d[7] = (_a[7] == _b[7]) ? ~UINT8_C(0) : 0x00; + // d[8] = (_a[8] == _b[8]) ? ~UINT8_C(0) : 0x00; + // d[9] = (_a[9] == _b[9]) ? ~UINT8_C(0) : 0x00; + // d[10] = (_a[10] == _b[10]) ? ~UINT8_C(0) : 0x00; + // d[11] = (_a[11] == _b[11]) ? ~UINT8_C(0) : 0x00; + // d[12] = (_a[12] == _b[12]) ? ~UINT8_C(0) : 0x00; + // d[13] = (_a[13] == _b[13]) ? ~UINT8_C(0) : 0x00; + // d[14] = (_a[14] == _b[14]) ? ~UINT8_C(0) : 0x00; + // d[15] = (_a[15] == _b[15]) ? ~UINT8_C(0) : 0x00; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpeq_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] == _b[0]) ? 0xffffffffffffffff : 0; + // uint64_t d1 = (_a[1] == _b[1]) ? 0xffffffffffffffff : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpeq_pd(a, b); + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // const uint64_t d0 = (_a[0] == _b[0]) ? 
~UINT64_C(0) : 0; + // const uint64_t d1 = ((const uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpeq_sd(a, b); + // + // return validate_double(c, *(const double *)&d0, *(const double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpge_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = (_a[1] >= _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpge_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpge_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpge_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // d[0] = _a[0] > _b[0] ? ~UINT16_C(0) : 0; + // d[1] = _a[1] > _b[1] ? ~UINT16_C(0) : 0; + // d[2] = _a[2] > _b[2] ? ~UINT16_C(0) : 0; + // d[3] = _a[3] > _b[3] ? ~UINT16_C(0) : 0; + // d[4] = _a[4] > _b[4] ? ~UINT16_C(0) : 0; + // d[5] = _a[5] > _b[5] ? ~UINT16_C(0) : 0; + // d[6] = _a[6] > _b[6] ? ~UINT16_C(0) : 0; + // d[7] = _a[7] > _b[7] ? ~UINT16_C(0) : 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpgt_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // int32_t result[4]; + // + // result[0] = _a[0] > _b[0] ? -1 : 0; + // result[1] = _a[1] > _b[1] ? -1 : 0; + // result[2] = _a[2] > _b[2] ? -1 : 0; + // result[3] = _a[3] > _b[3] ? -1 : 0; + // + // __m128i iret = _mm_cmpgt_epi32(a, b); + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = (_a[0] > _b[0]) ? ~UINT8_C(0) : 0x00; + // d[1] = (_a[1] > _b[1]) ? ~UINT8_C(0) : 0x00; + // d[2] = (_a[2] > _b[2]) ? ~UINT8_C(0) : 0x00; + // d[3] = (_a[3] > _b[3]) ? ~UINT8_C(0) : 0x00; + // d[4] = (_a[4] > _b[4]) ? ~UINT8_C(0) : 0x00; + // d[5] = (_a[5] > _b[5]) ? ~UINT8_C(0) : 0x00; + // d[6] = (_a[6] > _b[6]) ? ~UINT8_C(0) : 0x00; + // d[7] = (_a[7] > _b[7]) ? ~UINT8_C(0) : 0x00; + // d[8] = (_a[8] > _b[8]) ? ~UINT8_C(0) : 0x00; + // d[9] = (_a[9] > _b[9]) ? ~UINT8_C(0) : 0x00; + // d[10] = (_a[10] > _b[10]) ? ~UINT8_C(0) : 0x00; + // d[11] = (_a[11] > _b[11]) ? 
~UINT8_C(0) : 0x00; + // d[12] = (_a[12] > _b[12]) ? ~UINT8_C(0) : 0x00; + // d[13] = (_a[13] > _b[13]) ? ~UINT8_C(0) : 0x00; + // d[14] = (_a[14] > _b[14]) ? ~UINT8_C(0) : 0x00; + // d[15] = (_a[15] > _b[15]) ? ~UINT8_C(0) : 0x00; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpgt_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = (_a[1] > _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpgt_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpgt_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmple_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = (_a[1] <= _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmple_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmple_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmple_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // d[0] = _a[0] < _b[0] ? ~UINT16_C(0) : 0; + // d[1] = _a[1] < _b[1] ? ~UINT16_C(0) : 0; + // d[2] = _a[2] < _b[2] ? ~UINT16_C(0) : 0; + // d[3] = _a[3] < _b[3] ? ~UINT16_C(0) : 0; + // d[4] = _a[4] < _b[4] ? ~UINT16_C(0) : 0; + // d[5] = _a[5] < _b[5] ? ~UINT16_C(0) : 0; + // d[6] = _a[6] < _b[6] ? ~UINT16_C(0) : 0; + // d[7] = _a[7] < _b[7] ? 
~UINT16_C(0) : 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmplt_epi16(a, b); + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // int32_t result[4]; + // result[0] = _a[0] < _b[0] ? -1 : 0; + // result[1] = _a[1] < _b[1] ? -1 : 0; + // result[2] = _a[2] < _b[2] ? -1 : 0; + // result[3] = _a[3] < _b[3] ? -1 : 0; + // + // __m128i iret = _mm_cmplt_epi32(a, b); + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = (_a[0] < _b[0]) ? ~UINT8_C(0) : 0x00; + // d[1] = (_a[1] < _b[1]) ? ~UINT8_C(0) : 0x00; + // d[2] = (_a[2] < _b[2]) ? ~UINT8_C(0) : 0x00; + // d[3] = (_a[3] < _b[3]) ? ~UINT8_C(0) : 0x00; + // d[4] = (_a[4] < _b[4]) ? ~UINT8_C(0) : 0x00; + // d[5] = (_a[5] < _b[5]) ? ~UINT8_C(0) : 0x00; + // d[6] = (_a[6] < _b[6]) ? ~UINT8_C(0) : 0x00; + // d[7] = (_a[7] < _b[7]) ? ~UINT8_C(0) : 0x00; + // d[8] = (_a[8] < _b[8]) ? ~UINT8_C(0) : 0x00; + // d[9] = (_a[9] < _b[9]) ? ~UINT8_C(0) : 0x00; + // d[10] = (_a[10] < _b[10]) ? ~UINT8_C(0) : 0x00; + // d[11] = (_a[11] < _b[11]) ? ~UINT8_C(0) : 0x00; + // d[12] = (_a[12] < _b[12]) ? ~UINT8_C(0) : 0x00; + // d[13] = (_a[13] < _b[13]) ? ~UINT8_C(0) : 0x00; + // d[14] = (_a[14] < _b[14]) ? ~UINT8_C(0) : 0x00; + // d[15] = (_a[15] < _b[15]) ? ~UINT8_C(0) : 0x00; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmplt_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // int64_t f0 = (_a[0] < _b[0]) ? ~UINT64_C(0) : UINT64_C(0); + // int64_t f1 = (_a[1] < _b[1]) ? ~UINT64_C(0) : UINT64_C(0); + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmplt_pd(a, b); + // + // return validate_double(c, *(double *)&f0, *(double *)&f1); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] < _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmplt_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpneq_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // int64_t f0 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0); + // int64_t f1 = (_a[1] != _b[1]) ? 
~UINT64_C(0) : UINT64_C(0); + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpneq_pd(a, b); + // + // return validate_double(c, *(double *)&f0, *(double *)&f1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpneq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // + // int64_t f0 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0); + // int64_t f1 = ((int64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpneq_sd(a, b); + // + // return validate_double(c, *(double *)&f0, *(double *)&f1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnge_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = !(_a[1] >= _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnge_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnge_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnge_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpngt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = !(_a[1] > _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpngt_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpngt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpngt_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnle_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = !(_a[1] <= _b[1]) ? 
~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnle_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnle_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnle_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnlt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = !(_a[1] < _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnlt_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnlt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnlt_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpord_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a = _mm_load_pd(_a); + // __m128d b = _mm_load_pd(_b); + // + // double result[2]; + // + // for (uint32_t i = 0; i < 2; i++) { + // result[i] = cmp_noNaN(_a[i], _b[i]); + // } + // + // __m128d ret = _mm_cmpord_pd(a, b); + // + // return validate_double(ret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpord_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a = _mm_load_pd(_a); + // __m128d b = _mm_load_pd(_b); + // + // double c0 = cmp_noNaN(_a[0], _b[0]); + // double c1 = _a[1]; + // + // __m128d ret = _mm_cmpord_sd(a, b); + // return validate_double(ret, c0, c1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpunord_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a = _mm_load_pd(_a); + // __m128d b = _mm_load_pd(_b); + // + // double result[2]; + // result[0] = cmp_hasNaN(_a[0], _b[0]); + // result[1] = cmp_hasNaN(_a[1], _b[1]); + // + // __m128d ret = _mm_cmpunord_pd(a, b); + // return validate_double(ret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpunord_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // __m128d a = _mm_load_pd(_a); + // __m128d b = 
_mm_load_pd(_b); + // + // double result[2]; + // result[0] = cmp_hasNaN(_a[0], _b[0]); + // result[1] = _a[1]; + // + // __m128d ret = _mm_cmpunord_sd(a, b); + // return validate_double(ret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_comieq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comieq_sd correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] == _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comieq_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comige_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] >= _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comige_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_comigt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] > _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comigt_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_comile_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comile_sd correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] <= _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comile_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comilt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comilt_sd correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] < _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comilt_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comineq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comineq_sd correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. 
+ // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] != _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comineq_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi32_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // double trun[2] = {(double)_a[0], (double)_a[1]}; + // + // __m128d ret = _mm_cvtepi32_pd(a); + // return validate_double(ret, trun[0], trun[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi32_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // float trun[4]; + // for (uint32_t i = 0; i < 4; i++) { + // trun[i] = (float)_a[i]; + // } + // + // __m128 ret = _mm_cvtepi32_ps(a); + // return validate_float(ret, trun[0], trun[1], trun[2], trun[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpd_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d[0] = (int32_t)(bankersRounding(_a[0])); + // d[1] = (int32_t)(bankersRounding(_a[1])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d[0] = (int32_t)(floor(_a[0])); + // d[1] = (int32_t)(floor(_a[1])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d[0] = (int32_t)(ceil(_a[0])); + // d[1] = (int32_t)(ceil(_a[1])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d[0] = (int32_t)(_a[0]); + // d[1] = (int32_t)(_a[1]); + // break; + // } + // + // __m128d a = load_m128d(_a); + // __m128i ret = _mm_cvtpd_epi32(a); + // + // return validate_int32(ret, d[0], d[1], 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpd_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d[0] = (int32_t)(bankersRounding(_a[0])); + // d[1] = (int32_t)(bankersRounding(_a[1])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d[0] = (int32_t)(floor(_a[0])); + // d[1] = (int32_t)(floor(_a[1])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d[0] = (int32_t)(ceil(_a[0])); + // d[1] = (int32_t)(ceil(_a[1])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d[0] = (int32_t)(_a[0]); + // d[1] = (int32_t)(_a[1]); + // break; + // } + // + // __m128d a = load_m128d(_a); + // __m64 ret = _mm_cvtpd_pi32(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpd_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // float f0 = (float)_a[0]; + // float f1 = (float)_a[1]; + // const __m128d a = load_m128d(_a); + // + // __m128 r = _mm_cvtpd_ps(a); + // + // return validate_float(r, f0, f1, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi32_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { 
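+ // Unlike the _mm_cvtpd_* tests above, no rounding-mode cases are needed
+ // here: every int32_t value is exactly representable as a double, so the
+ // conversion from the __m64 lanes to double is exact.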
+ // const int32_t *_a = impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // + // double trun[2] = {(double)_a[0], (double)_a[1]}; + // + // __m128d ret = _mm_cvtpi32_pd(a); + // + // return validate_double(ret, trun[0], trun[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // __m128 a = load_m128(_a); + // int32_t d[4]; + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // for (uint32_t i = 0; i < 4; i++) { + // d[i] = (int32_t)(bankersRounding(_a[i])); + // } + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // for (uint32_t i = 0; i < 4; i++) { + // d[i] = (int32_t)(floorf(_a[i])); + // } + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // for (uint32_t i = 0; i < 4; i++) { + // d[i] = (int32_t)(ceilf(_a[i])); + // } + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // for (uint32_t i = 0; i < 4; i++) { + // d[i] = (int32_t)(_a[i]); + // } + // break; + // } + // + // __m128i ret = _mm_cvtps_epi32(a); + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // double d0 = (double)_a[0]; + // double d1 = (double)_a[1]; + // const __m128 a = load_m128(_a); + // + // __m128d r = _mm_cvtps_pd(a); + // + // return validate_double(r, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_f64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double d = _a[0]; + // + // const __m128d *a = (const __m128d *)_a; + // double r = _mm_cvtsd_f64(*a); + // + // return r == d ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // int32_t d; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d = (int32_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d = (int32_t)(floor(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d = (int32_t)(ceil(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d = (int32_t)(_a[0]); + // break; + // } + // + // __m128d a = load_m128d(_a); + // int32_t ret = _mm_cvtsd_si32(a); + // + // return ret == d ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // int64_t d; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d = (int64_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d = (int64_t)(floor(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d = (int64_t)(ceil(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d = (int64_t)(_a[0]); + // break; + // } + // + // __m128d a = load_m128d(_a); + // int64_t ret = _mm_cvtsd_si64(a); + // + // return ret == d ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_si64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_cvtsd_si64(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // float f0 = _b[0]; + // float f1 = _a[1]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128d b = load_m128d(_b); + // __m128 c = _mm_cvtsd_ss(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi128_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // + // int32_t d = _a[0]; + // + // __m128i a = load_m128i(_a); + // int c = _mm_cvtsi128_si32(a); + // + // return d == c ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi128_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d = _a[0]; + // + // __m128i a = load_m128i(_a); + // int64_t c = _mm_cvtsi128_si64(a); + // + // return d == c ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi128_si64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_cvtsi128_si64(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi32_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const int32_t b = (const int32_t)impl.test_cases_ints[iter]; + // + // __m128d a = load_m128d(_a); + // __m128d c = _mm_cvtsi32_sd(a, b); + // + // return validate_double(c, b, _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi32_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // + // int32_t d = _a[0]; + // + // __m128i c = _mm_cvtsi32_si128(*_a); + // + // return validate_int32(c, d, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const int64_t b = (const int64_t)impl.test_cases_ints[iter]; + // + // __m128d a = load_m128d(_a); + // __m128d c = _mm_cvtsi64_sd(a, b); + // + // return validate_double(c, b, _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d = _a[0]; + // + // __m128i c = _mm_cvtsi64_si128(*_a); + // + // return validate_int64(c, d, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64x_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_cvtsi64_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64x_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_cvtsi64_si128(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_cvtss_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // double d0 = double(_b[0]); + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128 b = load_m128(_b); + // __m128d c = _mm_cvtss_sd(a, b); + // return validate_double(c, d0, d1); + 
return TEST_UNIMPL; +} + +result_t test_mm_cvttpd_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = load_m128d(_a); + // int32_t d0 = (int32_t)(_a[0]); + // int32_t d1 = (int32_t)(_a[1]); + // + // __m128i ret = _mm_cvttpd_epi32(a); + // return validate_int32(ret, d0, d1, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvttpd_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = load_m128d(_a); + // int32_t d0 = (int32_t)(_a[0]); + // int32_t d1 = (int32_t)(_a[1]); + // + // __m64 ret = _mm_cvttpd_pi32(a); + // return validate_int32(ret, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_cvttps_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // __m128 a = load_m128(_a); + // int32_t trun[4]; + // for (uint32_t i = 0; i < 4; i++) { + // trun[i] = (int32_t)_a[i]; + // } + // + // __m128i ret = _mm_cvttps_epi32(a); + // return VALIDATE_INT32_M128(ret, trun); + return TEST_UNIMPL; +} + +result_t test_mm_cvttsd_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = _mm_load_sd(_a); + // int32_t ret = _mm_cvttsd_si32(a); + // + // return ret == (int32_t)_a[0] ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvttsd_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = _mm_load_sd(_a); + // int64_t ret = _mm_cvttsd_si64(a); + // + // return ret == (int64_t)_a[0] ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvttsd_si64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // #if defined(__clang__) + // The intrinsic _mm_cvttsd_si64x() does not exist in Clang + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = _mm_load_sd(_a); + // int64_t ret = _mm_cvttsd_si64x(a); + // + // return ret == (int64_t)_a[0] ? 
TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_div_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = 0.0, d1 = 0.0; + // + // if (_b[0] != 0.0) + // d0 = _a[0] / _b[0]; + // if (_b[1] != 0.0) + // d1 = _a[1] / _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_div_pd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_div_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double d0 = _a[0] / _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // + // __m128d c = _mm_div_sd(a, b); + // + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_extract_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint16_t *_a = (uint16_t *)impl.test_cases_int_pointer1; + // const int idx = iter & 0x7; + // __m128i a = load_m128i(_a); + // int c; + // switch (idx) { + // case 0: + // c = _mm_extract_epi16(a, 0); + // break; + // case 1: + // c = _mm_extract_epi16(a, 1); + // break; + // case 2: + // c = _mm_extract_epi16(a, 2); + // break; + // case 3: + // c = _mm_extract_epi16(a, 3); + // break; + // case 4: + // c = _mm_extract_epi16(a, 4); + // break; + // case 5: + // c = _mm_extract_epi16(a, 5); + // break; + // case 6: + // c = _mm_extract_epi16(a, 6); + // break; + // case 7: + // c = _mm_extract_epi16(a, 7); + // break; + // } + // + // ASSERT_RETURN(c == *(_a + idx)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_insert_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t insert = (int16_t)*impl.test_cases_int_pointer2; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[8]; + // for (int i = 0; i < 8; i++) { + // d##IDX[i] = _a[i]; + // } + // d##IDX[IDX] = insert; + // + // __m128i a##IDX = load_m128i(_a); + // __m128i b##IDX = _mm_insert_epi16(a##IDX, insert, IDX); + // CHECK_RESULT(VALIDATE_INT16_M128(b##IDX, d##IDX)) + // + // IMM_8_ITER + // #undef TEST_IMPL + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_lfence(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* FIXME: Assume that memory barriers always function as intended. 
*/ + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_load_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = _mm_load_pd(p); + // return validate_double(a, p[0], p[1]); + return TEST_UNIMPL; +} + +result_t test_mm_load_pd1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = _mm_load_pd1(p); + // return validate_double(a, p[0], p[0]); + return TEST_UNIMPL; +} + +result_t test_mm_load_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = _mm_load_sd(p); + // return validate_double(a, p[0], 0); + return TEST_UNIMPL; +} + +result_t test_mm_load_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *addr = impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_load_si128((const __m128i *)addr); + // + // return VALIDATE_INT32_M128(ret, addr); + return TEST_UNIMPL; +} + +result_t test_mm_load1_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *addr = (const double *)impl.test_cases_float_pointer1; + // + // __m128d ret = _mm_load1_pd(addr); + // + // return validate_double(ret, addr[0], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_loadh_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *addr = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d ret = _mm_loadh_pd(a, addr); + // + // return validate_double(ret, _a[0], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_loadl_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *addr = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_loadl_epi64((const __m128i *)addr); + // + // return validate_int64(ret, addr[0], 0); + return TEST_UNIMPL; +} + +result_t test_mm_loadl_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *addr = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d ret = _mm_loadl_pd(a, addr); + // + // return validate_double(ret, addr[0], _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_loadr_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *addr = (const double *)impl.test_cases_float_pointer1; + // + // __m128d ret = _mm_loadr_pd(addr); + // + // return validate_double(ret, addr[1], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_loadu_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = _mm_loadu_pd(p); + // return validate_double(a, p[0], p[1]); + return TEST_UNIMPL; +} + +result_t test_mm_loadu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i c = _mm_loadu_si128((const __m128i *)_a); + // return VALIDATE_INT32_M128(c, _a); + return TEST_UNIMPL; +} + +result_t test_mm_loadu_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // The GCC version before 11 does not implement intrinsic function + // _mm_loadu_si32. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483 + // for more information. 
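+  // Illustrative sketch only (not used by the guarded test below): on
+  // toolchains lacking _mm_loadu_si32, the same "load 32 bits into lane 0 and
+  // zero the upper lanes" semantics could be obtained with the SSE2 intrinsic
+  // _mm_cvtsi32_si128:
+  //
+  //   int32_t tmp;
+  //   memcpy(&tmp, addr, sizeof(tmp));       // unaligned-safe 32-bit load
+  //   __m128i ret = _mm_cvtsi32_si128(tmp);  // value in lane 0, rest zeroed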
+ // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) + // return TEST_UNIMPL; + // #else + // const int32_t *addr = (const int32_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_loadu_si32((const void *)addr); + // + // return validate_int32(ret, addr[0], 0, 0, 0); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_madd_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int32_t d0 = (int32_t)_a[0] * _b[0]; + // int32_t d1 = (int32_t)_a[1] * _b[1]; + // int32_t d2 = (int32_t)_a[2] * _b[2]; + // int32_t d3 = (int32_t)_a[3] * _b[3]; + // int32_t d4 = (int32_t)_a[4] * _b[4]; + // int32_t d5 = (int32_t)_a[5] * _b[5]; + // int32_t d6 = (int32_t)_a[6] * _b[6]; + // int32_t d7 = (int32_t)_a[7] * _b[7]; + // + // int32_t e[4]; + // e[0] = d0 + d1; + // e[1] = d2 + d3; + // e[2] = d4 + d5; + // e[3] = d6 + d7; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_madd_epi16(a, b); + // return VALIDATE_INT32_M128(c, e); + return TEST_UNIMPL; +} + +result_t test_mm_maskmoveu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_mask = (const uint8_t *)impl.test_cases_int_pointer2; + // char mem_addr[16]; + // + // __m128i a = load_m128i(_a); + // __m128i mask = load_m128i(_mask); + // _mm_maskmoveu_si128(a, mask, mem_addr); + // + // for (int i = 0; i < 16; i++) { + // if (_mask[i] >> 7) { + // ASSERT_RETURN(_a[i] == (uint8_t)mem_addr[i]); + // } + // } + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_max_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] > _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] > _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] > _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] > _b[7] ? _a[7] : _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_max_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = + // ((uint8_t)_a[0] > (uint8_t)_b[0]) ? ((uint8_t)_a[0]) : + // ((uint8_t)_b[0]); + // d[1] = + // ((uint8_t)_a[1] > (uint8_t)_b[1]) ? ((uint8_t)_a[1]) : + // ((uint8_t)_b[1]); + // d[2] = + // ((uint8_t)_a[2] > (uint8_t)_b[2]) ? ((uint8_t)_a[2]) : + // ((uint8_t)_b[2]); + // d[3] = + // ((uint8_t)_a[3] > (uint8_t)_b[3]) ? ((uint8_t)_a[3]) : + // ((uint8_t)_b[3]); + // d[4] = + // ((uint8_t)_a[4] > (uint8_t)_b[4]) ? ((uint8_t)_a[4]) : + // ((uint8_t)_b[4]); + // d[5] = + // ((uint8_t)_a[5] > (uint8_t)_b[5]) ? ((uint8_t)_a[5]) : + // ((uint8_t)_b[5]); + // d[6] = + // ((uint8_t)_a[6] > (uint8_t)_b[6]) ? ((uint8_t)_a[6]) : + // ((uint8_t)_b[6]); + // d[7] = + // ((uint8_t)_a[7] > (uint8_t)_b[7]) ? ((uint8_t)_a[7]) : + // ((uint8_t)_b[7]); + // d[8] = + // ((uint8_t)_a[8] > (uint8_t)_b[8]) ? 
((uint8_t)_a[8]) : + // ((uint8_t)_b[8]); + // d[9] = + // ((uint8_t)_a[9] > (uint8_t)_b[9]) ? ((uint8_t)_a[9]) : + // ((uint8_t)_b[9]); + // d[10] = ((uint8_t)_a[10] > (uint8_t)_b[10]) ? ((uint8_t)_a[10]) + // : ((uint8_t)_b[10]); + // d[11] = ((uint8_t)_a[11] > (uint8_t)_b[11]) ? ((uint8_t)_a[11]) + // : ((uint8_t)_b[11]); + // d[12] = ((uint8_t)_a[12] > (uint8_t)_b[12]) ? ((uint8_t)_a[12]) + // : ((uint8_t)_b[12]); + // d[13] = ((uint8_t)_a[13] > (uint8_t)_b[13]) ? ((uint8_t)_a[13]) + // : ((uint8_t)_b[13]); + // d[14] = ((uint8_t)_a[14] > (uint8_t)_b[14]) ? ((uint8_t)_a[14]) + // : ((uint8_t)_b[14]); + // d[15] = ((uint8_t)_a[15] > (uint8_t)_b[15]) ? ((uint8_t)_a[15]) + // : ((uint8_t)_b[15]); + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_max_epu8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double f0 = _a[0] > _b[0] ? _a[0] : _b[0]; + // double f1 = _a[1] > _b[1] ? _a[1] : _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_max_pd(a, b); + // + // return validate_double(c, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_max_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] > _b[0] ? _a[0] : _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_max_sd(a, b); + // + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_mfence(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* FIXME: Assume that memory barriers always function as intended. */ + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_min_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] < _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] < _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] < _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] < _b[7] ? _a[7] : _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = ((uint8_t)_a[0] < (uint8_t)_b[0]) ? (uint8_t)_a[0] : + // (uint8_t)_b[0]; d[1] = ((uint8_t)_a[1] < (uint8_t)_b[1]) ? (uint8_t)_a[1] + // : (uint8_t)_b[1]; d[2] = ((uint8_t)_a[2] < (uint8_t)_b[2]) ? + // (uint8_t)_a[2] : (uint8_t)_b[2]; d[3] = ((uint8_t)_a[3] < (uint8_t)_b[3]) + // ? (uint8_t)_a[3] : (uint8_t)_b[3]; d[4] = ((uint8_t)_a[4] < + // (uint8_t)_b[4]) ? (uint8_t)_a[4] : (uint8_t)_b[4]; d[5] = ((uint8_t)_a[5] + // < (uint8_t)_b[5]) ? 
(uint8_t)_a[5] : (uint8_t)_b[5]; d[6] = + // ((uint8_t)_a[6] < (uint8_t)_b[6]) ? (uint8_t)_a[6] : (uint8_t)_b[6]; d[7] + // = ((uint8_t)_a[7] < (uint8_t)_b[7]) ? (uint8_t)_a[7] : (uint8_t)_b[7]; + // d[8] = ((uint8_t)_a[8] < (uint8_t)_b[8]) ? (uint8_t)_a[8] : + // (uint8_t)_b[8]; d[9] = ((uint8_t)_a[9] < (uint8_t)_b[9]) ? (uint8_t)_a[9] + // : (uint8_t)_b[9]; d[10] = + // ((uint8_t)_a[10] < (uint8_t)_b[10]) ? (uint8_t)_a[10] : + // (uint8_t)_b[10]; + // d[11] = + // ((uint8_t)_a[11] < (uint8_t)_b[11]) ? (uint8_t)_a[11] : + // (uint8_t)_b[11]; + // d[12] = + // ((uint8_t)_a[12] < (uint8_t)_b[12]) ? (uint8_t)_a[12] : + // (uint8_t)_b[12]; + // d[13] = + // ((uint8_t)_a[13] < (uint8_t)_b[13]) ? (uint8_t)_a[13] : + // (uint8_t)_b[13]; + // d[14] = + // ((uint8_t)_a[14] < (uint8_t)_b[14]) ? (uint8_t)_a[14] : + // (uint8_t)_b[14]; + // d[15] = + // ((uint8_t)_a[15] < (uint8_t)_b[15]) ? (uint8_t)_a[15] : + // (uint8_t)_b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epu8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double f0 = _a[0] < _b[0] ? _a[0] : _b[0]; + // double f1 = _a[1] < _b[1] ? _a[1] : _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // + // __m128d c = _mm_min_pd(a, b); + // return validate_double(c, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_min_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] < _b[0] ? 
_a[0] : _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_min_sd(a, b); + // + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_move_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d0 = _a[0]; + // int64_t d1 = 0; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_move_epi64(a); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_move_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // + // double result[2]; + // result[0] = _b[0]; + // result[1] = _a[1]; + // + // __m128d ret = _mm_move_sd(a, b); + // return validate_double(ret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_movemask_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // + // const uint8_t *ip = (const uint8_t *)_a; + // int ret = 0; + // uint32_t mask = 1; + // for (uint32_t i = 0; i < 16; i++) { + // if (ip[i] & 0x80) { + // ret |= mask; + // } + // mask = mask << 1; + // } + // int test = _mm_movemask_epi8(a); + // ASSERT_RETURN(test == ret); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_movemask_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // unsigned int _c = 0; + // _c |= ((*(const uint64_t *)_a) >> 63) & 0x1; + // _c |= (((*(const uint64_t *)(_a + 1)) >> 62) & 0x2); + // + // __m128d a = load_m128d(_a); + // int c = _mm_movemask_pd(a); + // + // ASSERT_RETURN((unsigned int)c == _c); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_movepi64_pi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d0 = _a[0]; + // + // __m128i a = load_m128i(_a); + // __m64 c = _mm_movepi64_pi64(a); + // + // return validate_int64(c, d0); + return TEST_UNIMPL; +} + +result_t test_mm_movpi64_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d0 = _a[0]; + // + // __m64 a = load_m64(_a); + // __m128i c = _mm_movpi64_epi64(a); + // + // return validate_int64(c, d0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_mul_epu32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2; + // uint64_t dx = (uint64_t)(_a[0]) * (uint64_t)(_b[0]); + // uint64_t dy = (uint64_t)(_a[2]) * (uint64_t)(_b[2]); + // + // __m128i a = _mm_loadu_si128((const __m128i *)_a); + // __m128i b = _mm_loadu_si128((const __m128i *)_b); + // __m128i r = _mm_mul_epu32(a, b); + // return validate_uint64(r, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_mul_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] * _b[0]; + // double d1 = _a[1] * _b[1]; + // + // __m128d a = _mm_load_pd(_a); + // __m128d b 
= _mm_load_pd(_b); + // __m128d c = _mm_mul_pd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_mul_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double dx = _a[0] * _b[0]; + // double dy = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_mul_sd(a, b); + // return validate_double(c, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_mul_su32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2; + // + // uint64_t u = (uint64_t)(_a[0]) * (uint64_t)(_b[0]); + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 r = _mm_mul_su32(a, b); + // + // return validate_uint64(r, u); + return TEST_UNIMPL; +} + +result_t test_mm_mulhi_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // for (uint32_t i = 0; i < 8; i++) { + // int32_t m = (int32_t)_a[i] * (int32_t)_b[i]; + // d[i] = (int16_t)(m >> 16); + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_mulhi_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_mulhi_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // for (uint32_t i = 0; i < 8; i++) { + // uint32_t m = (uint32_t)_a[i] * (uint32_t)_b[i]; + // d[i] = (uint16_t)(m >> 16); + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_mulhi_epu16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_mullo_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] * _b[0]; + // d[1] = _a[1] * _b[1]; + // d[2] = _a[2] * _b[2]; + // d[3] = _a[3] * _b[3]; + // d[4] = _a[4] * _b[4]; + // d[5] = _a[5] * _b[5]; + // d[6] = _a[6] * _b[6]; + // d[7] = _a[7] * _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_mullo_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_or_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2; + // + // int64_t d0 = _a[0] | _b[0]; + // int64_t d1 = _a[1] | _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_or_pd(a, b); + // + // return validate_double(c, *((double *)&d0), *((double *)&d1)); + return TEST_UNIMPL; +} + +result_t test_mm_or_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128 fc = _mm_or_ps(*(const __m128 *)&a, 
*(const __m128 *)&b);
+  // __m128i c = *(const __m128i *)&fc;
+  // now for the assertion...
+  // const uint32_t *ia = (const uint32_t *)&a;
+  // const uint32_t *ib = (const uint32_t *)&b;
+  // uint32_t r[4];
+  // r[0] = ia[0] | ib[0];
+  // r[1] = ia[1] | ib[1];
+  // r[2] = ia[2] | ib[2];
+  // r[3] = ia[3] | ib[3];
+  // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
+  // result_t res = VALIDATE_INT32_M128(c, r);
+  // if (res) {
+  //   res = VALIDATE_INT32_M128(ret, r);
+  // }
+  // return res;
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_packs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // int8_t max = INT8_MAX;
+  // int8_t min = INT8_MIN;
+  // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  //
+  // int8_t d[16];
+  // for (int i = 0; i < 8; i++) {
+  //   if (_a[i] > max)
+  //     d[i] = max;
+  //   else if (_a[i] < min)
+  //     d[i] = min;
+  //   else
+  //     d[i] = (int8_t)_a[i];
+  // }
+  // for (int i = 0; i < 8; i++) {
+  //   if (_b[i] > max)
+  //     d[i + 8] = max;
+  //   else if (_b[i] < min)
+  //     d[i + 8] = min;
+  //   else
+  //     d[i + 8] = (int8_t)_b[i];
+  // }
+  //
+  // __m128i a = load_m128i(_a);
+  // __m128i b = load_m128i(_b);
+  // __m128i c = _mm_packs_epi16(a, b);
+  //
+  // return VALIDATE_INT8_M128(c, d);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_packs_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // int16_t max = INT16_MAX;
+  // int16_t min = INT16_MIN;
+  // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
+  // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
+  //
+  // int16_t d[8];
+  // for (int i = 0; i < 4; i++) {
+  //   if (_a[i] > max)
+  //     d[i] = max;
+  //   else if (_a[i] < min)
+  //     d[i] = min;
+  //   else
+  //     d[i] = (int16_t)_a[i];
+  // }
+  // for (int i = 0; i < 4; i++) {
+  //   if (_b[i] > max)
+  //     d[i + 4] = max;
+  //   else if (_b[i] < min)
+  //     d[i + 4] = min;
+  //   else
+  //     d[i + 4] = (int16_t)_b[i];
+  // }
+  //
+  // __m128i a = load_m128i(_a);
+  // __m128i b = load_m128i(_b);
+  // __m128i c = _mm_packs_epi32(a, b);
+  //
+  // return VALIDATE_INT16_M128(c, d);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_packus_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // uint8_t max = UINT8_MAX;
+  // uint8_t min = 0;
+  // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  //
+  // uint8_t d[16];
+  // for (int i = 0; i < 8; i++) {
+  //   if (_a[i] > (int16_t)max)
+  //     d[i] = max;
+  //   else if (_a[i] < (int16_t)min)
+  //     d[i] = min;
+  //   else
+  //     d[i] = (uint8_t)_a[i];
+  // }
+  // for (int i = 0; i < 8; i++) {
+  //   if (_b[i] > (int16_t)max)
+  //     d[i + 8] = max;
+  //   else if (_b[i] < (int16_t)min)
+  //     d[i + 8] = min;
+  //   else
+  //     d[i + 8] = (uint8_t)_b[i];
+  // }
+  //
+  // __m128i a = load_m128i(_a);
+  // __m128i b = load_m128i(_b);
+  // __m128i c = _mm_packus_epi16(a, b);
+  //
+  // return VALIDATE_UINT8_M128(c, d);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_pause(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // _mm_pause();
+  // return TEST_SUCCESS;
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_sad_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
+  // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2;
+  // uint16_t d0 = 0;
+  // uint16_t d1 = 0;
+  // for (int i = 0; i < 8; i++) {
+  //   d0 += abs(_a[i] - _b[i]);
+  // }
+  // for (int i = 8; i < 16; i++) {
+  //   d1 
+= abs(_a[i] - _b[i]); + // } + // + // const __m128i a = load_m128i(_a); + // const __m128i b = load_m128i(_b); + // __m128i c = _mm_sad_epu8(a, b); + // return validate_uint16(c, d0, 0, 0, 0, d1, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // int16_t d[8]; + // d[0] = _a[0]; + // d[1] = _a[1]; + // d[2] = _a[2]; + // d[3] = _a[3]; + // d[4] = _a[4]; + // d[5] = _a[5]; + // d[6] = _a[6]; + // d[7] = _a[7]; + // + // __m128i c = _mm_set_epi16(d[7], d[6], d[5], d[4], d[3], d[2], d[1], + // d[0]); return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t d[4]; + // d[3] = impl.test_cases_ints[iter]; + // d[2] = impl.test_cases_ints[iter + 1]; + // d[1] = impl.test_cases_ints[iter + 2]; + // d[0] = impl.test_cases_ints[iter + 3]; + // __m128i a = _mm_set_epi32(d[3], d[2], d[1], d[0]); + // return VALIDATE_INT32_M128(a, d); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_set_epi64(load_m64(&_a[1]), load_m64(&_a[0])); + // + // return validate_int64(ret, _a[0], _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_set_epi64x(_a[1], _a[0]); + // + // return validate_int64(ret, _a[0], _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // int8_t d[16]; + // d[0] = _a[0]; + // d[1] = _a[1]; + // d[2] = _a[2]; + // d[3] = _a[3]; + // d[4] = _a[4]; + // d[5] = _a[5]; + // d[6] = _a[6]; + // d[7] = _a[7]; + // d[8] = _a[8]; + // d[9] = _a[9]; + // d[10] = _a[10]; + // d[11] = _a[11]; + // d[12] = _a[12]; + // d[13] = _a[13]; + // d[14] = _a[14]; + // d[15] = _a[15]; + // + // __m128i c = _mm_set_epi8(d[15], d[14], d[13], d[12], d[11], d[10], d[9], + // d[8], + // d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_set_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // double x = p[0]; + // double y = p[1]; + // __m128d a = _mm_set_pd(x, y); + // return validate_double(a, y, x); + return TEST_UNIMPL; +} + +result_t test_mm_set_pd1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double _a = impl.test_cases_floats[iter]; + // + // __m128d a = _mm_set_pd1(_a); + // + // return validate_double(a, _a, _a); + return TEST_UNIMPL; +} + +result_t test_mm_set_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double f0 = _a[0]; + // double f1 = 0.0; + // + // __m128d a = _mm_set_sd(_a[0]); + // return validate_double(a, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // int16_t d0 = _a[0]; + // + // __m128i c = _mm_set1_epi16(d0); + // return validate_int16(c, d0, d0, d0, d0, d0, d0, d0, d0); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi32(const 
SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t x = impl.test_cases_ints[iter]; + // __m128i a = _mm_set1_epi32(x); + // return validate_int32(a, x, x, x, x); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_set1_epi64(load_m64(&_a[0])); + // + // return validate_int64(ret, _a[0], _a[0]); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_set1_epi64x(_a[0]); + // + // return validate_int64(ret, _a[0], _a[0]); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // int8_t d0 = _a[0]; + // __m128i c = _mm_set1_epi8(d0); + // return validate_int8(c, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, + // d0, + // d0, d0, d0); + return TEST_UNIMPL; +} + +result_t test_mm_set1_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // double d0 = _a[0]; + // __m128d c = _mm_set1_pd(d0); + // return validate_double(c, d0, d0); + return TEST_UNIMPL; +} + +result_t test_mm_setr_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // + // __m128i c = + // _mm_setr_epi16(_a[0], _a[1], _a[2], _a[3], _a[4], _a[5], _a[6], + // _a[7]); + // + // return VALIDATE_INT16_M128(c, _a); + return TEST_UNIMPL; +} + +result_t test_mm_setr_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i c = _mm_setr_epi32(_a[0], _a[1], _a[2], _a[3]); + // return VALIDATE_INT32_M128(c, _a); + return TEST_UNIMPL; +} + +result_t test_mm_setr_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // __m128i c = _mm_setr_epi64(load_m64(&_a[0]), load_m64(&_a[1])); + // return validate_int64(c, _a[0], _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_setr_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // __m128i c = _mm_setr_epi8(_a[0], _a[1], _a[2], _a[3], _a[4], _a[5], + // _a[6], + // _a[7], _a[8], _a[9], _a[10], _a[11], _a[12], + // _a[13], _a[14], _a[15]); + // + // return VALIDATE_INT8_M128(c, _a); + return TEST_UNIMPL; +} + +result_t test_mm_setr_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // + // double x = p[0]; + // double y = p[1]; + // + // __m128d a = _mm_setr_pd(x, y); + // + // return validate_double(a, x, y); + return TEST_UNIMPL; +} + +result_t test_mm_setzero_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128d a = _mm_setzero_pd(); + // return validate_double(a, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_setzero_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128i a = _mm_setzero_si128(); + // return validate_int32(a, 0, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // __m128i a, c; + // + // #define TEST_IMPL(IDX) + // int32_t d##IDX[4]; + // d##IDX[0] = _a[((IDX) & 
0x3)]; + // d##IDX[1] = _a[((IDX >> 2) & 0x3)]; + // d##IDX[2] = _a[((IDX >> 4) & 0x3)]; + // d##IDX[3] = _a[((IDX >> 6) & 0x3)]; + // + // a = load_m128i(_a); + // c = _mm_shuffle_epi32(a, IDX); + // CHECK_RESULT(VALIDATE_INT32_M128(c, d##IDX)) + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a, b, c; + // + // #define TEST_IMPL(IDX) + // a = load_m128d(_a); + // b = load_m128d(_b); + // c = _mm_shuffle_pd(a, b, IDX); + // + // double d0##IDX = _a[IDX & 0x1]; + // double d1##IDX = _b[(IDX & 0x2) >> 1]; + // CHECK_RESULT(validate_double(c, d0##IDX, d1##IDX)) + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_shufflehi_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a, c; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[8]; + // d##IDX[0] = _a[0]; + // d##IDX[1] = _a[1]; + // d##IDX[2] = _a[2]; + // d##IDX[3] = _a[3]; + // d##IDX[4] = ((const int64_t *)_a)[1] >> ((IDX & 0x3) * 16); + // d##IDX[5] = ((const int64_t *)_a)[1] >> (((IDX >> 2) & 0x3) * 16); + // d##IDX[6] = ((const int64_t *)_a)[1] >> (((IDX >> 4) & 0x3) * 16); + // d##IDX[7] = ((const int64_t *)_a)[1] >> (((IDX >> 6) & 0x3) * 16); + // + // a = load_m128i(_a); + // c = _mm_shufflehi_epi16(a, IDX); + // + // CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_shufflelo_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a, c; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[8]; + // d##IDX[0] = ((const int64_t *)_a)[0] >> ((IDX & 0x3) * 16); + // d##IDX[1] = ((const int64_t *)_a)[0] >> (((IDX >> 2) & 0x3) * 16); + // d##IDX[2] = ((const int64_t *)_a)[0] >> (((IDX >> 4) & 0x3) * 16); + // d##IDX[3] = ((const int64_t *)_a)[0] >> (((IDX >> 6) & 0x3) * 16); + // d##IDX[4] = _a[4]; + // d##IDX[5] = _a[5]; + // d##IDX[6] = _a[6]; + // d##IDX[7] = _a[7]; + // + // a = load_m128i(_a); + // c = _mm_shufflelo_epi16(a, IDX); + // + // CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sll_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a, b, c; + // + // #define TEST_IMPL(IDX) + // uint16_t d##IDX[8]; + // d##IDX[0] = (IDX > 15) ? 0 : _a[0] << IDX; + // d##IDX[1] = (IDX > 15) ? 0 : _a[1] << IDX; + // d##IDX[2] = (IDX > 15) ? 0 : _a[2] << IDX; + // d##IDX[3] = (IDX > 15) ? 0 : _a[3] << IDX; + // d##IDX[4] = (IDX > 15) ? 0 : _a[4] << IDX; + // d##IDX[5] = (IDX > 15) ? 0 : _a[5] << IDX; + // d##IDX[6] = (IDX > 15) ? 0 : _a[6] << IDX; + // d##IDX[7] = (IDX > 15) ? 
0 : _a[7] << IDX; + // + // a = load_m128i(_a); + // b = _mm_set1_epi64x(IDX); + // c = _mm_sll_epi16(a, b); + // CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + // + // IMM_64_ITER + // #undef TEST_IMPL + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sll_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i a, b, c; + // + // #define TEST_IMPL(IDX) + // uint32_t d##IDX[4]; + // d##IDX[0] = (IDX > 31) ? 0 : _a[0] << IDX; + // d##IDX[1] = (IDX > 31) ? 0 : _a[1] << IDX; + // d##IDX[2] = (IDX > 31) ? 0 : _a[2] << IDX; + // d##IDX[3] = (IDX > 31) ? 0 : _a[3] << IDX; + // + // a = load_m128i(_a); + // b = _mm_set1_epi64x(IDX); + // c = _mm_sll_epi32(a, b); + // CHECK_RESULT(VALIDATE_INT32_M128(c, d##IDX)) + // + // IMM_64_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sll_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // __m128i a, b, c; + // + // #define TEST_IMPL(IDX) + // uint64_t d0##IDX = (IDX & ~63) ? 0 : _a[0] << IDX; + // uint64_t d1##IDX = (IDX & ~63) ? 0 : _a[1] << IDX; + // + // a = load_m128i(_a); + // b = _mm_set1_epi64x(IDX); + // c = _mm_sll_epi64(a, b); + // + // CHECK_RESULT(validate_int64(c, d0##IDX, d1##IDX)) + // + // IMM_64_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_slli_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a, c; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[8]; + // d##IDX[0] = (IDX > 15) ? 0 : _a[0] << IDX; + // d##IDX[1] = (IDX > 15) ? 0 : _a[1] << IDX; + // d##IDX[2] = (IDX > 15) ? 0 : _a[2] << IDX; + // d##IDX[3] = (IDX > 15) ? 0 : _a[3] << IDX; + // d##IDX[4] = (IDX > 15) ? 0 : _a[4] << IDX; + // d##IDX[5] = (IDX > 15) ? 0 : _a[5] << IDX; + // d##IDX[6] = (IDX > 15) ? 0 : _a[6] << IDX; + // d##IDX[7] = (IDX > 15) ? 0 : _a[7] << IDX; + // + // a = load_m128i(_a); + // c = _mm_slli_epi16(a, IDX); + // CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + // + // IMM_64_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_slli_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // #if defined(__clang__) + // Clang compiler does not allow the second argument of _mm_slli_epi32() to + // be greater than 31. + // const int count = (int)(iter % 33 - 1); // range: -1 ~ 31 + // #else + // const int count = (int)(iter % 34 - 1); // range: -1 ~ 32 + // #endif + // + // int32_t d[4]; + // d[0] = (count & ~31) ? 0 : _a[0] << count; + // d[1] = (count & ~31) ? 0 : _a[1] << count; + // d[2] = (count & ~31) ? 0 : _a[2] << count; + // d[3] = (count & ~31) ? 0 : _a[3] << count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_slli_epi32(a, count); + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_slli_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // #if defined(__clang__) + // Clang compiler does not allow the second argument of "_mm_slli_epi64()" + // to be greater than 63. 
+ // const int count = (int)(iter % 65 - 1); // range: -1 ~ 63 + // #else + // const int count = (int)(iter % 66 - 1); // range: -1 ~ 64 + // #endif + // int64_t d0 = (count & ~63) ? 0 : _a[0] << count; + // int64_t d1 = (count & ~63) ? 0 : _a[1] << count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_slli_epi64(a, count); + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_slli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // + // int8_t d[16]; + // int count = (iter % 5) << 2; + // for (int i = 0; i < 16; i++) { + // if (i < count) + // d[i] = 0; + // else + // d[i] = ((const int8_t *)_a)[i - count]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i ret; + // switch (iter % 5) { + // case 0: + // ret = _mm_slli_si128(a, 0); + // break; + // case 1: + // ret = _mm_slli_si128(a, 4); + // break; + // case 2: + // ret = _mm_slli_si128(a, 8); + // break; + // case 3: + // ret = _mm_slli_si128(a, 12); + // break; + // case 4: + // ret = _mm_slli_si128(a, 16); + // break; + // } + // + // return VALIDATE_INT8_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_sqrt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double f0 = sqrt(_a[0]); + // double f1 = sqrt(_a[1]); + // + // __m128d a = load_m128d(_a); + // __m128d c = _mm_sqrt_pd(a); + // + // return validate_double_error(c, f0, f1, 1.0e-15); + return TEST_UNIMPL; +} + +result_t test_mm_sqrt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double f0 = sqrt(_b[0]); + // double f1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_sqrt_sd(a, b); + // + // return validate_double_error(c, f0, f1, 1.0e-15); + return TEST_UNIMPL; +} + +result_t test_mm_sra_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 18 - 1); // range: -1 ~ 16 + // + // int16_t d[8]; + // d[0] = (count & ~15) ? (_a[0] < 0 ? ~UINT16_C(0) : 0) : (_a[0] >> count); + // d[1] = (count & ~15) ? (_a[1] < 0 ? ~UINT16_C(0) : 0) : (_a[1] >> count); + // d[2] = (count & ~15) ? (_a[2] < 0 ? ~UINT16_C(0) : 0) : (_a[2] >> count); + // d[3] = (count & ~15) ? (_a[3] < 0 ? ~UINT16_C(0) : 0) : (_a[3] >> count); + // d[4] = (count & ~15) ? (_a[4] < 0 ? ~UINT16_C(0) : 0) : (_a[4] >> count); + // d[5] = (count & ~15) ? (_a[5] < 0 ? ~UINT16_C(0) : 0) : (_a[5] >> count); + // d[6] = (count & ~15) ? (_a[6] < 0 ? ~UINT16_C(0) : 0) : (_a[6] >> count); + // d[7] = (count & ~15) ? (_a[7] < 0 ? ~UINT16_C(0) : 0) : (_a[7] >> count); + // + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_sra_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sra_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 34 - 1); // range: -1 ~ 32 + // + // int32_t d[4]; + // d[0] = (count & ~31) ? (_a[0] < 0 ? ~UINT32_C(0) : 0) : _a[0] >> count; + // d[1] = (count & ~31) ? (_a[1] < 0 ? ~UINT32_C(0) : 0) : _a[1] >> count; + // d[2] = (count & ~31) ? (_a[2] < 0 ? 
~UINT32_C(0) : 0) : _a[2] >> count; + // d[3] = (count & ~31) ? (_a[3] < 0 ? ~UINT32_C(0) : 0) : _a[3] >> count; + // + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_sra_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srai_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int32_t b = (int32_t)(iter % 18 - 1); // range: -1 ~ 16 + // int16_t d[8]; + // int count = (b & ~15) ? 15 : b; + // + // for (int i = 0; i < 8; i++) { + // d[i] = _a[i] >> count; + // } + // + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i c = _mm_srai_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srai_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t b = (int32_t)(iter % 34 - 1); // range: -1 ~ 32 + // + // int32_t d[4]; + // int count = (b & ~31) ? 31 : b; + // for (int i = 0; i < 4; i++) { + // d[i] = _a[i] >> count; + // } + // + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i c = _mm_srai_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srl_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 18 - 1); // range: -1 ~ 16 + // + // uint16_t d[8]; + // d[0] = (count & ~15) ? 0 : (uint16_t)(_a[0]) >> count; + // d[1] = (count & ~15) ? 0 : (uint16_t)(_a[1]) >> count; + // d[2] = (count & ~15) ? 0 : (uint16_t)(_a[2]) >> count; + // d[3] = (count & ~15) ? 0 : (uint16_t)(_a[3]) >> count; + // d[4] = (count & ~15) ? 0 : (uint16_t)(_a[4]) >> count; + // d[5] = (count & ~15) ? 0 : (uint16_t)(_a[5]) >> count; + // d[6] = (count & ~15) ? 0 : (uint16_t)(_a[6]) >> count; + // d[7] = (count & ~15) ? 0 : (uint16_t)(_a[7]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_srl_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srl_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 34 - 1); // range: -1 ~ 32 + // + // uint32_t d[4]; + // d[0] = (count & ~31) ? 0 : (uint32_t)(_a[0]) >> count; + // d[1] = (count & ~31) ? 0 : (uint32_t)(_a[1]) >> count; + // d[2] = (count & ~31) ? 0 : (uint32_t)(_a[2]) >> count; + // d[3] = (count & ~31) ? 0 : (uint32_t)(_a[3]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_srl_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srl_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 66 - 1); // range: -1 ~ 64 + // + // uint64_t d0 = (count & ~63) ? 0 : (uint64_t)(_a[0]) >> count; + // uint64_t d1 = (count & ~63) ? 
0 : (uint64_t)(_a[1]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_srl_epi64(a, b); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_srli_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int count = (int)(iter % 18 - 1); // range: -1 ~ 16 + // + // int16_t d[8]; + // d[0] = count & (~15) ? 0 : (uint16_t)(_a[0]) >> count; + // d[1] = count & (~15) ? 0 : (uint16_t)(_a[1]) >> count; + // d[2] = count & (~15) ? 0 : (uint16_t)(_a[2]) >> count; + // d[3] = count & (~15) ? 0 : (uint16_t)(_a[3]) >> count; + // d[4] = count & (~15) ? 0 : (uint16_t)(_a[4]) >> count; + // d[5] = count & (~15) ? 0 : (uint16_t)(_a[5]) >> count; + // d[6] = count & (~15) ? 0 : (uint16_t)(_a[6]) >> count; + // d[7] = count & (~15) ? 0 : (uint16_t)(_a[7]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_srli_epi16(a, count); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srli_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int count = (int)(iter % 34 - 1); // range: -1 ~ 32 + // + // int32_t d[4]; + // d[0] = count & (~31) ? 0 : (uint32_t)(_a[0]) >> count; + // d[1] = count & (~31) ? 0 : (uint32_t)(_a[1]) >> count; + // d[2] = count & (~31) ? 0 : (uint32_t)(_a[2]) >> count; + // d[3] = count & (~31) ? 0 : (uint32_t)(_a[3]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_srli_epi32(a, count); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srli_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int count = (int)(iter % 66 - 1); // range: -1 ~ 64 + // + // int64_t d0 = count & (~63) ? 0 : (uint64_t)(_a[0]) >> count; + // int64_t d1 = count & (~63) ? 
0 : (uint64_t)(_a[1]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_srli_epi64(a, count); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_srli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int count = (iter % 5) << 2; + // + // int8_t d[16]; + // for (int i = 0; i < 16; i++) { + // if (i >= (16 - count)) + // d[i] = 0; + // else + // d[i] = _a[i + count]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i ret; + // switch (iter % 5) { + // case 0: + // ret = _mm_srli_si128(a, 0); + // break; + // case 1: + // ret = _mm_srli_si128(a, 4); + // break; + // case 2: + // ret = _mm_srli_si128(a, 8); + // break; + // case 3: + // ret = _mm_srli_si128(a, 12); + // break; + // case 4: + // ret = _mm_srli_si128(a, 16); + // break; + // } + // + // return VALIDATE_INT8_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_store_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double x = impl.test_cases_floats[iter + 4]; + // double y = impl.test_cases_floats[iter + 6]; + // + // __m128d a = _mm_set_pd(x, y); + // _mm_store_pd(p, a); + // ASSERT_RETURN(p[0] == y); + // ASSERT_RETURN(p[1] == x); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_pd1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double _a[2] = {(double)impl.test_cases_floats[iter], + // (double)impl.test_cases_floats[iter + 1]}; + // + // __m128d a = load_m128d(_a); + // _mm_store_pd1(p, a); + // ASSERT_RETURN(p[0] == impl.test_cases_floats[iter]); + // ASSERT_RETURN(p[1] == impl.test_cases_floats[iter]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double _a[2] = {(double)impl.test_cases_floats[iter], + // (double)impl.test_cases_floats[iter + 1]}; + // + // __m128d a = load_m128d(_a); + // _mm_store_sd(p, a); + // ASSERT_RETURN(p[0] == impl.test_cases_floats[iter]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // alignas(16) int32_t p[4]; + // + // __m128i a = load_m128i(_a); + // _mm_store_si128((__m128i *)p, a); + // + // return VALIDATE_INT32_M128(a, p); + return TEST_UNIMPL; +} + +result_t test_mm_store1_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_store_pd1(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_storeh_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double mem; + // + // __m128d a = load_m128d(p); + // _mm_storeh_pd(&mem, a); + // + // ASSERT_RETURN(mem == p[1]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storel_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int64_t *p = (int64_t *)impl.test_cases_int_pointer1; + // __m128i mem; + // + // __m128i a = load_m128i(p); + // _mm_storel_epi64(&mem, a); + // + // ASSERT_RETURN(((SIMDVec *)&mem)->m128_u64[0] == (uint64_t)p[0]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storel_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // 
double mem; + // + // __m128d a = load_m128d(p); + // _mm_storel_pd(&mem, a); + // + // ASSERT_RETURN(mem == p[0]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storer_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double mem[2]; + // + // __m128d a = load_m128d(p); + // _mm_storer_pd(mem, a); + // + // __m128d res = load_m128d(mem); + // return validate_double(res, p[1], p[0]); + return TEST_UNIMPL; +} + +result_t test_mm_storeu_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double x = impl.test_cases_floats[iter + 4]; + // double y = impl.test_cases_floats[iter + 6]; + // + // __m128d a = _mm_set_pd(x, y); + // _mm_storeu_pd(p, a); + // ASSERT_RETURN(p[0] == y); + // ASSERT_RETURN(p[1] == x); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storeu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i b; + // __m128i a = load_m128i(_a); + // _mm_storeu_si128(&b, a); + // int32_t *_b = (int32_t *)&b; + // return VALIDATE_INT32_M128(a, _b); + return TEST_UNIMPL; +} + +result_t test_mm_storeu_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // The GCC version before 11 does not implement intrinsic function + // _mm_storeu_si32. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) + // return TEST_UNIMPL; + // #else + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i b; + // __m128i a = load_m128i(_a); + // _mm_storeu_si32(&b, a); + // int32_t *_b = (int32_t *)&b; + // return validate_int32(b, _a[0], _b[1], _b[2], _b[3]); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_stream_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // double p[2]; + // + // __m128d a = load_m128d(_a); + // _mm_stream_pd(p, a); + // + // return validate_double(a, p[0], p[1]); + return TEST_UNIMPL; +} + +result_t test_mm_stream_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // alignas(16) int32_t p[4]; + // + // __m128i a = load_m128i(_a); + // _mm_stream_si128((__m128i *)p, a); + // + // return VALIDATE_INT32_M128(a, p); + return TEST_UNIMPL; +} + +result_t test_mm_stream_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t a = (const int32_t)impl.test_cases_ints[iter]; + // int32_t p; + // + // _mm_stream_si32(&p, a); + // + // ASSERT_RETURN(a == p) + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_stream_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t a = (const int64_t)impl.test_cases_ints[iter]; + // __int64 p[1]; + // _mm_stream_si64(p, a); + // ASSERT_RETURN(p[0] == a); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sub_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] - _b[0]; + // d[1] = _a[1] - _b[1]; + // d[2] = _a[2] - _b[2]; + // d[3] = _a[3] - _b[3]; + // d[4] = _a[4] - _b[4]; + // d[5] = _a[5] - _b[5]; + // d[6] = _a[6] - _b[6]; + // d[7] = _a[7] - _b[7]; + // + // 
__m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sub_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sub_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // int32_t d[4]; + // d[0] = _a[0] - _b[0]; + // d[1] = _a[1] - _b[1]; + // d[2] = _a[2] - _b[2]; + // d[3] = _a[3] - _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sub_epi32(a, b); + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sub_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (int64_t *)impl.test_cases_int_pointer2; + // int64_t d0 = _a[0] - _b[0]; + // int64_t d1 = _a[1] - _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sub_epi64(a, b); + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_sub_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = _a[0] - _b[0]; + // d[1] = _a[1] - _b[1]; + // d[2] = _a[2] - _b[2]; + // d[3] = _a[3] - _b[3]; + // d[4] = _a[4] - _b[4]; + // d[5] = _a[5] - _b[5]; + // d[6] = _a[6] - _b[6]; + // d[7] = _a[7] - _b[7]; + // d[8] = _a[8] - _b[8]; + // d[9] = _a[9] - _b[9]; + // d[10] = _a[10] - _b[10]; + // d[11] = _a[11] - _b[11]; + // d[12] = _a[12] - _b[12]; + // d[13] = _a[13] - _b[13]; + // d[14] = _a[14] - _b[14]; + // d[15] = _a[15] - _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sub_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sub_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] - _b[0]; + // double d1 = _a[1] - _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_sub_pd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_sub_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] - _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_sub_sd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_sub_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t d = _a[0] - _b[0]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sub_si64(a, b); + // + // return validate_int64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_subs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t max = 32767; + // int32_t min = -32768; + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + 
// + // int16_t d[8]; + // for (int i = 0; i < 8; i++) { + // int32_t res = (int32_t)_a[i] - (int32_t)_b[i]; + // if (res > max) + // d[i] = max; + // else if (res < min) + // d[i] = min; + // else + // d[i] = (int16_t)res; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_subs_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_subs_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int16_t max = 127; + // int16_t min = -128; + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; + // for (int i = 0; i < 16; i++) { + // int16_t res = (int16_t)_a[i] - (int16_t)_b[i]; + // if (res > max) + // d[i] = max; + // else if (res < min) + // d[i] = min; + // else + // d[i] = (int8_t)res; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_subs_epi8(a, b); + // + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_subs_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // d[0] = (uint16_t)_a[0] - (uint16_t)_b[0]; + // if (d[0] > (uint16_t)_a[0]) + // d[0] = 0; + // d[1] = (uint16_t)_a[1] - (uint16_t)_b[1]; + // if (d[1] > (uint16_t)_a[1]) + // d[1] = 0; + // d[2] = (uint16_t)_a[2] - (uint16_t)_b[2]; + // if (d[2] > (uint16_t)_a[2]) + // d[2] = 0; + // d[3] = (uint16_t)_a[3] - (uint16_t)_b[3]; + // if (d[3] > (uint16_t)_a[3]) + // d[3] = 0; + // d[4] = (uint16_t)_a[4] - (uint16_t)_b[4]; + // if (d[4] > (uint16_t)_a[4]) + // d[4] = 0; + // d[5] = (uint16_t)_a[5] - (uint16_t)_b[5]; + // if (d[5] > (uint16_t)_a[5]) + // d[5] = 0; + // d[6] = (uint16_t)_a[6] - (uint16_t)_b[6]; + // if (d[6] > (uint16_t)_a[6]) + // d[6] = 0; + // d[7] = (uint16_t)_a[7] - (uint16_t)_b[7]; + // if (d[7] > (uint16_t)_a[7]) + // d[7] = 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_subs_epu16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_subs_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = (uint8_t)_a[0] - (uint8_t)_b[0]; + // if (d[0] > (uint8_t)_a[0]) + // d[0] = 0; + // d[1] = (uint8_t)_a[1] - (uint8_t)_b[1]; + // if (d[1] > (uint8_t)_a[1]) + // d[1] = 0; + // d[2] = (uint8_t)_a[2] - (uint8_t)_b[2]; + // if (d[2] > (uint8_t)_a[2]) + // d[2] = 0; + // d[3] = (uint8_t)_a[3] - (uint8_t)_b[3]; + // if (d[3] > (uint8_t)_a[3]) + // d[3] = 0; + // d[4] = (uint8_t)_a[4] - (uint8_t)_b[4]; + // if (d[4] > (uint8_t)_a[4]) + // d[4] = 0; + // d[5] = (uint8_t)_a[5] - (uint8_t)_b[5]; + // if (d[5] > (uint8_t)_a[5]) + // d[5] = 0; + // d[6] = (uint8_t)_a[6] - (uint8_t)_b[6]; + // if (d[6] > (uint8_t)_a[6]) + // d[6] = 0; + // d[7] = (uint8_t)_a[7] - (uint8_t)_b[7]; + // if (d[7] > (uint8_t)_a[7]) + // d[7] = 0; + // d[8] = (uint8_t)_a[8] - (uint8_t)_b[8]; + // if (d[8] > (uint8_t)_a[8]) + // d[8] = 0; + // d[9] = (uint8_t)_a[9] - (uint8_t)_b[9]; + // if (d[9] > (uint8_t)_a[9]) + // d[9] = 0; + // d[10] = (uint8_t)_a[10] - (uint8_t)_b[10]; + // if (d[10] > (uint8_t)_a[10]) + // d[10] = 0; + // d[11] = (uint8_t)_a[11] 
- (uint8_t)_b[11]; + // if (d[11] > (uint8_t)_a[11]) + // d[11] = 0; + // d[12] = (uint8_t)_a[12] - (uint8_t)_b[12]; + // if (d[12] > (uint8_t)_a[12]) + // d[12] = 0; + // d[13] = (uint8_t)_a[13] - (uint8_t)_b[13]; + // if (d[13] > (uint8_t)_a[13]) + // d[13] = 0; + // d[14] = (uint8_t)_a[14] - (uint8_t)_b[14]; + // if (d[14] > (uint8_t)_a[14]) + // d[14] = 0; + // d[15] = (uint8_t)_a[15] - (uint8_t)_b[15]; + // if (d[15] > (uint8_t)_a[15]) + // d[15] = 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_subs_epu8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_ucomieq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comieq_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomige_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comige_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomigt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comigt_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomile_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comile_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomilt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comilt_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomineq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comineq_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_undefined_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128d a = _mm_undefined_pd(); + // a = _mm_xor_pd(a, a); + // return validate_double(a, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_undefined_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128i a = _mm_undefined_si128(); + // a = _mm_xor_si128(a, a); + // return validate_int64(a, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[8]; + // d[0] = _a[4]; + // d[1] = _b[4]; + // d[2] = _a[5]; + // d[3] = _b[5]; + // d[4] = _a[6]; + // d[5] = _b[6]; + // d[6] = _a[7]; + // d[7] = _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpackhi_epi16(a, b); + // + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = _a[2]; + // d[1] = _b[2]; + // d[2] = _a[3]; + // d[3] = _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpackhi_epi32(a, b); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t i0 = _a[1]; + // int64_t i1 = _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpackhi_epi64(a, b); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_epi8(const 
SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; + // d[0] = _a[8]; + // d[1] = _b[8]; + // d[2] = _a[9]; + // d[3] = _b[9]; + // d[4] = _a[10]; + // d[5] = _b[10]; + // d[6] = _a[11]; + // d[7] = _b[11]; + // d[8] = _a[12]; + // d[9] = _b[12]; + // d[10] = _a[13]; + // d[11] = _b[13]; + // d[12] = _a[14]; + // d[13] = _b[14]; + // d[14] = _a[15]; + // d[15] = _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpackhi_epi8(a, b); + // + // return VALIDATE_INT8_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d ret = _mm_unpackhi_pd(a, b); + // + // return validate_double(ret, _a[1], _b[1]); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[8]; + // d[0] = _a[0]; + // d[1] = _b[0]; + // d[2] = _a[1]; + // d[3] = _b[1]; + // d[4] = _a[2]; + // d[5] = _b[2]; + // d[6] = _a[3]; + // d[7] = _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpacklo_epi16(a, b); + // + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = _a[0]; + // d[1] = _b[0]; + // d[2] = _a[1]; + // d[3] = _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpacklo_epi32(a, b); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t i0 = _a[0]; + // int64_t i1 = _b[0]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpacklo_epi64(a, b); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; + // d[0] = _a[0]; + // d[1] = _b[0]; + // d[2] = _a[1]; + // d[3] = _b[1]; + // d[4] = _a[2]; + // d[5] = _b[2]; + // d[6] = _a[3]; + // d[7] = _b[3]; + // d[8] = _a[4]; + // d[9] = _b[4]; + // d[10] = _a[5]; + // d[11] = _b[5]; + // d[12] = _a[6]; + // d[13] = _b[6]; + // d[14] = _a[7]; + // d[15] = _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpacklo_epi8(a, b); + // + // return VALIDATE_INT8_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const 
double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d ret = _mm_unpacklo_pd(a, b); + // + // return validate_double(ret, _a[0], _b[0]); + return TEST_UNIMPL; +} + +result_t test_mm_xor_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2; + // + // int64_t d0 = _a[0] ^ _b[0]; + // int64_t d1 = _a[1] ^ _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_xor_pd(a, b); + // + // return validate_double(c, *((double *)&d0), *((double *)&d1)); + return TEST_UNIMPL; +} + +result_t test_mm_xor_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t d0 = _a[0] ^ _b[0]; + // int64_t d1 = _a[1] ^ _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_xor_si128(a, b); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +/* SSE3 */ +result_t test_mm_addsub_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double d0 = _a[0] - _b[0]; + // double d1 = _a[1] + _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_addsub_pd(a, b); + // + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_addsub_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _a[0] - _b[0]; + // float f1 = _a[1] + _b[1]; + // float f2 = _a[2] - _b[2]; + // float f3 = _a[3] + _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_addsub_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double f0 = _a[0] + _a[1]; + // double f1 = _b[0] + _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_hadd_pd(a, b); + // + // return validate_double(c, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _a[0] + _a[1]; + // float f1 = _a[2] + _a[3]; + // float f2 = _b[0] + _b[1]; + // float f3 = _b[2] + _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_hadd_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double f0 = _a[0] 
- _a[1]; + // double f1 = _b[0] - _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_hsub_pd(a, b); + // + // return validate_double(c, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _a[0] - _a[1]; + // float f1 = _a[2] - _a[3]; + // float f2 = _b[0] - _b[1]; + // float f3 = _b[2] - _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_hsub_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_lddqu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_loadu_si128(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_loaddup_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *addr = (const double *)impl.test_cases_float_pointer1; + // + // __m128d ret = _mm_loaddup_pd(addr); + // + // return validate_double(ret, addr[0], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_movedup_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = load_m128d(p); + // __m128d b = _mm_movedup_pd(a); + // + // return validate_double(b, p[0], p[0]); + return TEST_UNIMPL; +} + +result_t test_mm_movehdup_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // __m128 a = load_m128(p); + // return validate_float(_mm_movehdup_ps(a), p[1], p[1], p[3], p[3]); + return TEST_UNIMPL; +} + +result_t test_mm_moveldup_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // __m128 a = load_m128(p); + // return validate_float(_mm_moveldup_ps(a), p[0], p[0], p[2], p[2]); + return TEST_UNIMPL; +} + +/* SSSE3 */ +result_t test_mm_abs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // __m128i c = _mm_abs_epi16(a); + // + // uint32_t d[8]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // d[2] = (_a[2] < 0) ? -_a[2] : _a[2]; + // d[3] = (_a[3] < 0) ? -_a[3] : _a[3]; + // d[4] = (_a[4] < 0) ? -_a[4] : _a[4]; + // d[5] = (_a[5] < 0) ? -_a[5] : _a[5]; + // d[6] = (_a[6] < 0) ? -_a[6] : _a[6]; + // d[7] = (_a[7] < 0) ? -_a[7] : _a[7]; + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // __m128i c = _mm_abs_epi32(a); + // + // uint32_t d[4]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // d[2] = (_a[2] < 0) ? -_a[2] : _a[2]; + // d[3] = (_a[3] < 0) ? -_a[3] : _a[3]; + // + // return VALIDATE_UINT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // __m128i c = _mm_abs_epi8(a); + // + // uint32_t d[16]; + // for (int i = 0; i < 16; i++) { + // d[i] = (_a[i] < 0) ? 
-_a[i] : _a[i]; + // } + // + // return VALIDATE_UINT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // __m64 c = _mm_abs_pi16(a); + // + // uint32_t d[4]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // d[2] = (_a[2] < 0) ? -_a[2] : _a[2]; + // d[3] = (_a[3] < 0) ? -_a[3] : _a[3]; + // + // return VALIDATE_UINT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // __m64 c = _mm_abs_pi32(a); + // + // uint32_t d[2]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // + // return VALIDATE_UINT32_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // __m64 c = _mm_abs_pi8(a); + // + // uint32_t d[8]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // d[2] = (_a[2] < 0) ? -_a[2] : _a[2]; + // d[3] = (_a[3] < 0) ? -_a[3] : _a[3]; + // d[4] = (_a[4] < 0) ? -_a[4] : _a[4]; + // d[5] = (_a[5] < 0) ? -_a[5] : _a[5]; + // d[6] = (_a[6] < 0) ? -_a[6] : _a[6]; + // d[7] = (_a[7] < 0) ? -_a[7] : _a[7]; + // + // return VALIDATE_UINT8_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_alignr_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // #if defined(__clang__) + // return TEST_UNIMPL; + // #else + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // unsigned int shift = (iter % 5) << 3; + // uint8_t d[32]; + // + // if (shift >= 32) { + // memset((void *)d, 0, sizeof(d)); + // } else { + // memcpy((void *)d, (const void *)_b, 16); + // memcpy((void *)(d + 16), (const void *)_a, 16); + // // shifting + // for (size_t x = 0; x < sizeof(d); x++) { + // if (x + shift >= sizeof(d)) + // d[x] = 0; + // else + // d[x] = d[x + shift]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret; + // switch (iter % 5) { + // case 0: + // ret = _mm_alignr_epi8(a, b, 0); + // break; + // case 1: + // ret = _mm_alignr_epi8(a, b, 8); + // break; + // case 2: + // ret = _mm_alignr_epi8(a, b, 16); + // break; + // case 3: + // ret = _mm_alignr_epi8(a, b, 24); + // break; + // case 4: + // ret = _mm_alignr_epi8(a, b, 32); + // break; + // } + // + // return VALIDATE_UINT8_M128(ret, d); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_alignr_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // #if defined(__clang__) + // return TEST_UNIMPL; + // #else + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // unsigned int shift = (iter % 3) << 3; + // uint8_t d[16]; + // + // if (shift >= 16) { + // memset((void *)d, 0, sizeof(d)); + // } else { + // memcpy((void *)d, (const void *)_b, 8); + // memcpy((void *)(d + 8), (const void *)_a, 8); + // // shifting + // for (size_t x = 0; x < sizeof(d); x++) { + // if (x + shift >= sizeof(d)) + // d[x] = 0; + // else + // d[x] = d[x + shift]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // 
__m64 ret; + // switch (iter % 3) { + // case 0: + // ret = _mm_alignr_pi8(a, b, 0); + // break; + // case 1: + // ret = _mm_alignr_pi8(a, b, 8); + // break; + // case 2: + // ret = _mm_alignr_pi8(a, b, 16); + // break; + // } + // + // return VALIDATE_UINT8_M64(ret, d); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_hadd_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] + _a[1]; + // d[1] = _a[2] + _a[3]; + // d[2] = _a[4] + _a[5]; + // d[3] = _a[6] + _a[7]; + // d[4] = _b[0] + _b[1]; + // d[5] = _b[2] + _b[3]; + // d[6] = _b[4] + _b[5]; + // d[7] = _b[6] + _b[7]; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_hadd_epi16(a, b); + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // int32_t d[4]; + // d[0] = _a[0] + _a[1]; + // d[1] = _a[2] + _a[3]; + // d[2] = _b[0] + _b[1]; + // d[3] = _b[2] + _b[3]; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_hadd_epi32(a, b); + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[4]; + // d[0] = _a[0] + _a[1]; + // d[1] = _a[2] + _a[3]; + // d[2] = _b[0] + _b[1]; + // d[3] = _b[2] + _b[3]; + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_hadd_pi16(a, b); + // return VALIDATE_INT16_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // int32_t d[2]; + // d[0] = _a[0] + _a[1]; + // d[1] = _b[0] + _b[1]; + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_hadd_pi32(a, b); + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_hadds_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int16_t d16[8]; + // int32_t d32[8]; + // d32[0] = (int32_t)_a[0] + (int32_t)_a[1]; + // d32[1] = (int32_t)_a[2] + (int32_t)_a[3]; + // d32[2] = (int32_t)_a[4] + (int32_t)_a[5]; + // d32[3] = (int32_t)_a[6] + (int32_t)_a[7]; + // d32[4] = (int32_t)_b[0] + (int32_t)_b[1]; + // d32[5] = (int32_t)_b[2] + (int32_t)_b[3]; + // d32[6] = (int32_t)_b[4] + (int32_t)_b[5]; + // d32[7] = (int32_t)_b[6] + (int32_t)_b[7]; + // for (int i = 0; i < 8; i++) { + // if (d32[i] > (int32_t)INT16_MAX) + // d16[i] = INT16_MAX; + // else if (d32[i] < (int32_t)INT16_MIN) + // d16[i] = INT16_MIN; + // else + // d16[i] = (int16_t)d32[i]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_hadds_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d16); + return TEST_UNIMPL; +} + +result_t test_mm_hadds_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + 
// const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int16_t d16[8]; + // int32_t d32[8]; + // d32[0] = (int32_t)_a[0] + (int32_t)_a[1]; + // d32[1] = (int32_t)_a[2] + (int32_t)_a[3]; + // d32[2] = (int32_t)_b[0] + (int32_t)_b[1]; + // d32[3] = (int32_t)_b[2] + (int32_t)_b[3]; + // for (int i = 0; i < 8; i++) { + // if (d32[i] > (int32_t)INT16_MAX) + // d16[i] = INT16_MAX; + // else if (d32[i] < (int32_t)INT16_MIN) + // d16[i] = INT16_MIN; + // else + // d16[i] = (int16_t)d32[i]; + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_hadds_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, d16); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int16_t d[8]; + // d[0] = _a[0] - _a[1]; + // d[1] = _a[2] - _a[3]; + // d[2] = _a[4] - _a[5]; + // d[3] = _a[6] - _a[7]; + // d[4] = _b[0] - _b[1]; + // d[5] = _b[2] - _b[3]; + // d[6] = _b[4] - _b[5]; + // d[7] = _b[6] - _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_hsub_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = _a[0] - _a[1]; + // d[1] = _a[2] - _a[3]; + // d[2] = _b[0] - _b[1]; + // d[3] = _b[2] - _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_hsub_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[4]; + // d[0] = _a[0] - _a[1]; + // d[1] = _a[2] - _a[3]; + // d[2] = _b[0] - _b[1]; + // d[3] = _b[2] - _b[3]; + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_hsub_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // + // int32_t d[2]; + // d[0] = _a[0] - _a[1]; + // d[1] = _b[0] - _b[1]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_hsub_pi32(a, b); + // + // return VALIDATE_INT32_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_hsubs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int16_t d16[8]; + // int32_t d32[8]; + // d32[0] = (int32_t)_a[0] - (int32_t)_a[1]; + // d32[1] = (int32_t)_a[2] - (int32_t)_a[3]; + // d32[2] = (int32_t)_a[4] - (int32_t)_a[5]; + // d32[3] = (int32_t)_a[6] - (int32_t)_a[7]; + // d32[4] = (int32_t)_b[0] - (int32_t)_b[1]; + // d32[5] = (int32_t)_b[2] - (int32_t)_b[3]; + // d32[6] = (int32_t)_b[4] - (int32_t)_b[5]; + // d32[7] = (int32_t)_b[6] - (int32_t)_b[7]; + // for (int i = 0; i < 8; i++) { + // if (d32[i] > 
(int32_t)INT16_MAX) + // d16[i] = INT16_MAX; + // else if (d32[i] < (int32_t)INT16_MIN) + // d16[i] = INT16_MIN; + // else + // d16[i] = (int16_t)d32[i]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_hsubs_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d16); + return TEST_UNIMPL; +} + +result_t test_mm_hsubs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int32_t _d[4]; + // _d[0] = (int32_t)_a[0] - (int32_t)_a[1]; + // _d[1] = (int32_t)_a[2] - (int32_t)_a[3]; + // _d[2] = (int32_t)_b[0] - (int32_t)_b[1]; + // _d[3] = (int32_t)_b[2] - (int32_t)_b[3]; + // + // for (int i = 0; i < 4; i++) { + // if (_d[i] > (int32_t)INT16_MAX) { + // _d[i] = INT16_MAX; + // } else if (_d[i] < (int32_t)INT16_MIN) { + // _d[i] = INT16_MIN; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_hsubs_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, _d); + return TEST_UNIMPL; +} + +result_t test_mm_maddubs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int32_t d0 = (int32_t)(_a[0] * _b[0]); + // int32_t d1 = (int32_t)(_a[1] * _b[1]); + // int32_t d2 = (int32_t)(_a[2] * _b[2]); + // int32_t d3 = (int32_t)(_a[3] * _b[3]); + // int32_t d4 = (int32_t)(_a[4] * _b[4]); + // int32_t d5 = (int32_t)(_a[5] * _b[5]); + // int32_t d6 = (int32_t)(_a[6] * _b[6]); + // int32_t d7 = (int32_t)(_a[7] * _b[7]); + // int32_t d8 = (int32_t)(_a[8] * _b[8]); + // int32_t d9 = (int32_t)(_a[9] * _b[9]); + // int32_t d10 = (int32_t)(_a[10] * _b[10]); + // int32_t d11 = (int32_t)(_a[11] * _b[11]); + // int32_t d12 = (int32_t)(_a[12] * _b[12]); + // int32_t d13 = (int32_t)(_a[13] * _b[13]); + // int32_t d14 = (int32_t)(_a[14] * _b[14]); + // int32_t d15 = (int32_t)(_a[15] * _b[15]); + // + // int16_t e[8]; + // e[0] = saturate_16(d0 + d1); + // e[1] = saturate_16(d2 + d3); + // e[2] = saturate_16(d4 + d5); + // e[3] = saturate_16(d6 + d7); + // e[4] = saturate_16(d8 + d9); + // e[5] = saturate_16(d10 + d11); + // e[6] = saturate_16(d12 + d13); + // e[7] = saturate_16(d14 + d15); + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_maddubs_epi16(a, b); + // return VALIDATE_INT16_M128(c, e); + return TEST_UNIMPL; +} + +result_t test_mm_maddubs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int16_t d0 = (int16_t)(_a[0] * _b[0]); + // int16_t d1 = (int16_t)(_a[1] * _b[1]); + // int16_t d2 = (int16_t)(_a[2] * _b[2]); + // int16_t d3 = (int16_t)(_a[3] * _b[3]); + // int16_t d4 = (int16_t)(_a[4] * _b[4]); + // int16_t d5 = (int16_t)(_a[5] * _b[5]); + // int16_t d6 = (int16_t)(_a[6] * _b[6]); + // int16_t d7 = (int16_t)(_a[7] * _b[7]); + // + // int16_t e[4]; + // e[0] = saturate_16(d0 + d1); + // e[1] = saturate_16(d2 + d3); + // e[2] = saturate_16(d4 + d5); + // e[3] = saturate_16(d6 + d7); + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_maddubs_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, e); + return TEST_UNIMPL; +} + +result_t test_mm_mulhrs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const 
int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // int32_t _c[8]; + // for (int i = 0; i < 8; i++) { + // _c[i] = (((((int32_t)_a[i] * (int32_t)_b[i]) >> 14) + 1) & 0x1FFFE) >> + // 1; + // } + // __m128i c = _mm_mulhrs_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, _c); + return TEST_UNIMPL; +} + +result_t test_mm_mulhrs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // int32_t _c[4]; + // for (int i = 0; i < 4; i++) { + // _c[i] = (((((int32_t)_a[i] * (int32_t)_b[i]) >> 14) + 1) & 0x1FFFE) >> + // 1; + // } + // __m64 c = _mm_mulhrs_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, _c); + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t dst[16]; + // + // for (int i = 0; i < 16; i++) { + // if (_b[i] & 0x80) { + // dst[i] = 0; + // } else { + // dst[i] = _a[_b[i] & 0x0F]; + // } + // } + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_shuffle_epi8(a, b); + // + // return VALIDATE_INT8_M128(ret, dst); + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t dst[8]; + // + // for (int i = 0; i < 8; i++) { + // if (_b[i] & 0x80) { + // dst[i] = 0; + // } else { + // dst[i] = _a[_b[i] & 0x07]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_shuffle_pi8(a, b); + // + // return VALIDATE_INT8_M64(ret, dst); + return TEST_UNIMPL; +} + +result_t test_mm_sign_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[8]; + // for (int i = 0; i < 8; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sign_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // for (int i = 0; i < 4; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sign_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; 
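+ // Reference for _mm_sign_epi8: negate _a[i] when _b[i] is negative, zero it when _b[i] is zero, and pass it through unchanged otherwise (same pattern as the other _mm_sign_* tests above).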
+ // for (int i = 0; i < 16; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sign_epi8(a, b); + // + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[4]; + // for (int i = 0; i < 4; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sign_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[2]; + // for (int i = 0; i < 2; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sign_pi32(a, b); + // + // return VALIDATE_INT32_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[8]; + // for (int i = 0; i < 8; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sign_pi8(a, b); + // + // return VALIDATE_INT8_M64(c, d); + return TEST_UNIMPL; +} + +/* SSE4.1 */ +result_t test_mm_blend_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t _c[8]; + // __m128i a, b, c; + // + // #define TEST_IMPL(IDX) + // for (int j = 0; j < 8; j++) { + // if ((IDX >> j) & 0x1) { + // _c[j] = _b[j]; + // } else { + // _c[j] = _a[j]; + // } + // } + // a = load_m128i(_a); + // b = load_m128i(_b); + // c = _mm_blend_epi16(a, b, IDX); + // CHECK_RESULT(VALIDATE_INT16_M128(c, _c)); + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_blend_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a, b, c; + // + // #define TEST_IMPL(IDX) + // double _c##IDX[2]; + // for (int j = 0; j < 2; j++) { + // if ((IDX >> j) & 0x1) { + // _c##IDX[j] = _b[j]; + // } else { + // _c##IDX[j] = _a[j]; + // } + // } + // + // a = load_m128d(_a); + // b = load_m128d(_b); + // c = _mm_blend_pd(a, b, IDX); + // CHECK_RESULT(validate_double(c, _c##IDX[0], _c##IDX[1])) + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_blend_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = 
impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c; + // + // gcc and clang can't compile call to _mm_blend_ps with 3rd argument as + // integer type due 4 bit size limitation. + // #define TEST_IMPL(IDX) + // float _c##IDX[4]; + // for (int i = 0; i < 4; i++) { + // if (IDX & (1 << i)) { + // _c##IDX[i] = _b[i]; + // } else { + // _c##IDX[i] = _a[i]; + // } + // } + // + // c = _mm_blend_ps(a, b, IDX); + // CHECK_RESULT( + // validate_float(c, _c##IDX[0], _c##IDX[1], _c##IDX[2], _c##IDX[3])) + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_blendv_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // const int8_t _mask[16] = {(const int8_t)impl.test_cases_ints[iter], + // (const int8_t)impl.test_cases_ints[iter + 1], + // (const int8_t)impl.test_cases_ints[iter + 2], + // (const int8_t)impl.test_cases_ints[iter + 3], + // (const int8_t)impl.test_cases_ints[iter + 4], + // (const int8_t)impl.test_cases_ints[iter + 5], + // (const int8_t)impl.test_cases_ints[iter + 6], + // (const int8_t)impl.test_cases_ints[iter + 7]}; + // + // int8_t _c[16]; + // for (int i = 0; i < 16; i++) { + // if (_mask[i] >> 7) { + // _c[i] = _b[i]; + // } else { + // _c[i] = _a[i]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i mask = load_m128i(_mask); + // __m128i c = _mm_blendv_epi8(a, b, mask); + // + // return VALIDATE_INT8_M128(c, _c); + return TEST_UNIMPL; +} + +result_t test_mm_blendv_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // const double _mask[] = {(double)impl.test_cases_floats[iter], + // (double)impl.test_cases_floats[iter + 1]}; + // + // double _c[2]; + // for (int i = 0; i < 2; i++) { + // // signed shift right would return a result which is either all 1's + // from + // // negative numbers or all 0's from positive numbers + // if ((*(const int64_t *)(_mask + i)) >> 63) { + // _c[i] = _b[i]; + // } else { + // _c[i] = _a[i]; + // } + // } + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d mask = load_m128d(_mask); + // + // __m128d c = _mm_blendv_pd(a, b, mask); + // + // return validate_double(c, _c[0], _c[1]); + return TEST_UNIMPL; +} + +result_t test_mm_blendv_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // const float _mask[] = { + // impl.test_cases_floats[iter], impl.test_cases_floats[iter + 1], + // impl.test_cases_floats[iter + 2], impl.test_cases_floats[iter + 3]}; + // + // float _c[4]; + // for (int i = 0; i < 4; i++) { + // // signed shift right would return a result which is either all 1's + // from + // // negative numbers or all 0's from positive numbers + // if ((*(const int32_t *)(_mask + i)) >> 31) { + // _c[i] = _b[i]; + // } else { + // _c[i] = _a[i]; + // } + // } + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 mask = load_m128(_mask); + // + // __m128 c = _mm_blendv_ps(a, b, mask); + // + // return validate_float(c, _c[0], _c[1], _c[2], _c[3]); + return TEST_UNIMPL; +} + +result_t 
test_mm_ceil_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double dx = ceil(_a[0]); + // double dy = ceil(_a[1]); + // + // __m128d a = load_m128d(_a); + // __m128d ret = _mm_ceil_pd(a); + // + // return validate_double(ret, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_ceil_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // float dx = ceilf(_a[0]); + // float dy = ceilf(_a[1]); + // float dz = ceilf(_a[2]); + // float dw = ceilf(_a[3]); + // + // __m128 a = _mm_load_ps(_a); + // __m128 c = _mm_ceil_ps(a); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_ceil_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double dx = ceil(_b[0]); + // double dy = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d ret = _mm_ceil_sd(a, b); + // + // return validate_double(ret, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_ceil_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer1; + // + // float f0 = ceilf(_b[0]); + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_ceil_ss(a, b); + // + // return validate_float(c, f0, _a[1], _a[2], _a[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // int64_t d0 = (_a[0] == _b[0]) ? 0xffffffffffffffff : 0x0; + // int64_t d1 = (_a[1] == _b[1]) ? 
0xffffffffffffffff : 0x0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpeq_epi64(a, b); + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi16_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // d[2] = (int32_t)_a[2]; + // d[3] = (int32_t)_a[3]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi16_epi32(a); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi16_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi16_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi32_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi32_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi8_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // int16_t d[8]; + // d[0] = (int16_t)_a[0]; + // d[1] = (int16_t)_a[1]; + // d[2] = (int16_t)_a[2]; + // d[3] = (int16_t)_a[3]; + // d[4] = (int16_t)_a[4]; + // d[5] = (int16_t)_a[5]; + // d[6] = (int16_t)_a[6]; + // d[7] = (int16_t)_a[7]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi8_epi16(a); + // + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi8_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // d[2] = (int32_t)_a[2]; + // d[3] = (int32_t)_a[3]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi8_epi32(a); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi8_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi8_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu16_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // d[2] = (int32_t)_a[2]; + // d[3] = (int32_t)_a[3]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu16_epi32(a); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu16_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu16_epi64(a); + // + // return validate_int64(ret, 
i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu32_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu32_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu8_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // + // int16_t d[8]; + // d[0] = (int16_t)_a[0]; + // d[1] = (int16_t)_a[1]; + // d[2] = (int16_t)_a[2]; + // d[3] = (int16_t)_a[3]; + // d[4] = (int16_t)_a[4]; + // d[5] = (int16_t)_a[5]; + // d[6] = (int16_t)_a[6]; + // d[7] = (int16_t)_a[7]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu8_epi16(a); + // + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu8_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // d[2] = (int32_t)_a[2]; + // d[3] = (int32_t)_a[3]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu8_epi32(a); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu8_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu8_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +#define MM_DP_PD_TEST_CASE_WITH(imm8) \ + do { \ + const double *_a = (const double *)impl.test_cases_float_pointer1; \ + const double *_b = (const double *)impl.test_cases_float_pointer2; \ + const int imm = imm8; \ + double d[2]; \ + double sum = 0; \ + for (size_t i = 0; i < 2; i++) \ + sum += ((imm) & (1 << (i + 4))) ? _a[i] * _b[i] : 0; \ + for (size_t i = 0; i < 2; i++) \ + d[i] = (imm & (1 << i)) ? 
sum : 0; \ + __m128d a = load_m128d(_a); \ + __m128d b = load_m128d(_b); \ + __m128d ret = _mm_dp_pd(a, b, imm); \ + if (validate_double(ret, d[0], d[1]) != TEST_SUCCESS) \ + return TEST_FAIL; \ + } while (0) + +#define GENERATE_MM_DP_PD_TEST_CASES \ + MM_DP_PD_TEST_CASE_WITH(0xF0); \ + MM_DP_PD_TEST_CASE_WITH(0xF1); \ + MM_DP_PD_TEST_CASE_WITH(0xF2); \ + MM_DP_PD_TEST_CASE_WITH(0xFF); \ + MM_DP_PD_TEST_CASE_WITH(0x10); \ + MM_DP_PD_TEST_CASE_WITH(0x11); \ + MM_DP_PD_TEST_CASE_WITH(0x12); \ + MM_DP_PD_TEST_CASE_WITH(0x13); \ + MM_DP_PD_TEST_CASE_WITH(0x00); \ + MM_DP_PD_TEST_CASE_WITH(0x01); \ + MM_DP_PD_TEST_CASE_WITH(0x02); \ + MM_DP_PD_TEST_CASE_WITH(0x03); \ + MM_DP_PD_TEST_CASE_WITH(0x20); \ + MM_DP_PD_TEST_CASE_WITH(0x21); \ + MM_DP_PD_TEST_CASE_WITH(0x22); \ + MM_DP_PD_TEST_CASE_WITH(0x23); + +result_t test_mm_dp_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_DP_PD_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +#define MM_DP_PS_TEST_CASE_WITH(IMM) \ + do { \ + const float *_a = impl.test_cases_float_pointer1; \ + const float *_b = impl.test_cases_float_pointer2; \ + const int imm = IMM; \ + __m128 a = load_m128(_a); \ + __m128 b = load_m128(_b); \ + __m128 out = _mm_dp_ps(a, b, imm); \ + float r[4]; /* the reference */ \ + float sum = 0; \ + for (size_t i = 0; i < 4; i++) \ + sum += ((imm) & (1 << (i + 4))) ? _a[i] * _b[i] : 0; \ + for (size_t i = 0; i < 4; i++) \ + r[i] = (imm & (1 << i)) ? sum : 0; \ + /* the epsilon has to be large enough, otherwise test suite fails. */ \ + if (validate_float_epsilon(out, r[0], r[1], r[2], r[3], 2050.0f) != \ + TEST_SUCCESS) \ + return TEST_FAIL; \ + } while (0) + +#define GENERATE_MM_DP_PS_TEST_CASES \ + MM_DP_PS_TEST_CASE_WITH(0xFF); \ + MM_DP_PS_TEST_CASE_WITH(0x7F); \ + MM_DP_PS_TEST_CASE_WITH(0x9F); \ + MM_DP_PS_TEST_CASE_WITH(0x2F); \ + MM_DP_PS_TEST_CASE_WITH(0x0F); \ + MM_DP_PS_TEST_CASE_WITH(0x23); \ + MM_DP_PS_TEST_CASE_WITH(0xB5); + +result_t test_mm_dp_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_DP_PS_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_extract_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t *_a = (int32_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // int c; + // + // #define TEST_IMPL(IDX) + // c = _mm_extract_epi32(a, IDX); + // ASSERT_RETURN(c == *(_a + IDX)); + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_extract_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int64_t *_a = (int64_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // __int64 c; + // + // #define TEST_IMPL(IDX) + // c = _mm_extract_epi64(a, IDX); + // ASSERT_RETURN(c == *(_a + IDX)); + // + // IMM_2_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_extract_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint8_t *_a = (uint8_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // int c; + // + // #define TEST_IMPL(IDX) + // c = _mm_extract_epi8(a, IDX); + // ASSERT_RETURN(c == *(_a + IDX)); + // + // IMM_8_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_extract_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // __m128 a = _mm_load_ps(_a); + // int32_t c; + // + // #define TEST_IMPL(IDX) + // c = _mm_extract_ps(a, IDX); + 
// ASSERT_RETURN(c == *(const int32_t *)(_a + IDX)); + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_floor_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double dx = floor(_a[0]); + // double dy = floor(_a[1]); + // + // __m128d a = load_m128d(_a); + // __m128d ret = _mm_floor_pd(a); + // + // return validate_double(ret, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_floor_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // float dx = floorf(_a[0]); + // float dy = floorf(_a[1]); + // float dz = floorf(_a[2]); + // float dw = floorf(_a[3]); + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_floor_ps(a); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_floor_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double dx = floor(_b[0]); + // double dy = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d ret = _mm_floor_sd(a, b); + // + // return validate_double(ret, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_floor_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer1; + // + // float f0 = floorf(_b[0]); + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_floor_ss(a, b); + // + // return validate_float(c, f0, _a[1], _a[2], _a[3]); + return TEST_UNIMPL; +} + +result_t test_mm_insert_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t insert = (int32_t)*impl.test_cases_int_pointer2; + // __m128i a, b; + // + // #define TEST_IMPL(IDX) + // int32_t d##IDX[4]; + // for (int i = 0; i < 4; i++) { + // d##IDX[i] = _a[i]; + // } + // d##IDX[IDX] = insert; + // + // a = load_m128i(_a); + // b = _mm_insert_epi32(a, (int)insert, IDX); + // CHECK_RESULT(VALIDATE_INT32_M128(b, d##IDX)); + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_insert_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // int64_t insert = (int64_t)*impl.test_cases_int_pointer2; + // + // __m128i a, b; + // int64_t d[2]; + // #define TEST_IMPL(IDX) + // d[0] = _a[0]; + // d[1] = _a[1]; + // d[IDX] = insert; + // a = load_m128i(_a); + // b = _mm_insert_epi64(a, insert, IDX); + // CHECK_RESULT(validate_int64(b, d[0], d[1])); + // + // IMM_2_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_insert_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t insert = (int8_t)*impl.test_cases_int_pointer2; + // __m128i a, b; + // int8_t d[16]; + // + // #define TEST_IMPL(IDX) + // for (int i = 0; i < 16; i++) { + // d[i] = _a[i]; + // } + // d[IDX] = insert; + // a = load_m128i(_a); + // b = _mm_insert_epi8(a, insert, IDX); + // CHECK_RESULT(VALIDATE_INT8_M128(b, d)); + // + // IMM_16_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t 
test_mm_insert_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // __m128 a, b, c; + // #define TEST_IMPL(IDX) + // float d##IDX[4] = {_a[0], _a[1], _a[2], _a[3]}; + // d##IDX[(IDX >> 4) & 0x3] = _b[(IDX >> 6) & 0x3]; + // + // for (int j = 0; j < 4; j++) { + // if (IDX & (1 << j)) { + // d##IDX[j] = 0; + // } + // } + // + // a = _mm_load_ps(_a); + // b = _mm_load_ps(_b); + // c = _mm_insert_ps(a, b, IDX); + // CHECK_RESULT(validate_float(c, d##IDX[0], d##IDX[1], d##IDX[2], + // d##IDX[3])); + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_max_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_max_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] > _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] > _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] > _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] > _b[7] ? _a[7] : _b[7]; + // d[8] = _a[8] > _b[8] ? _a[8] : _b[8]; + // d[9] = _a[9] > _b[9] ? _a[9] : _b[9]; + // d[10] = _a[10] > _b[10] ? _a[10] : _b[10]; + // d[11] = _a[11] > _b[11] ? _a[11] : _b[11]; + // d[12] = _a[12] > _b[12] ? _a[12] : _b[12]; + // d[13] = _a[13] > _b[13] ? _a[13] : _b[13]; + // d[14] = _a[14] > _b[14] ? _a[14] : _b[14]; + // d[15] = _a[15] > _b[15] ? _a[15] : _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_max_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // + // uint16_t d[8]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] > _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] > _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] > _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] > _b[7] ? _a[7] : _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_max_epu16(a, b); + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_epu32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2; + // + // uint32_t d[4]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? 
_a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_max_epu32(a, b); + // + // return VALIDATE_UINT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] < _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] < _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] < _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] < _b[7] ? _a[7] : _b[7]; + // d[8] = _a[8] < _b[8] ? _a[8] : _b[8]; + // d[9] = _a[9] < _b[9] ? _a[9] : _b[9]; + // d[10] = _a[10] < _b[10] ? _a[10] : _b[10]; + // d[11] = _a[11] < _b[11] ? _a[11] : _b[11]; + // d[12] = _a[12] < _b[12] ? _a[12] : _b[12]; + // d[13] = _a[13] < _b[13] ? _a[13] : _b[13]; + // d[14] = _a[14] < _b[14] ? _a[14] : _b[14]; + // d[15] = _a[15] < _b[15] ? _a[15] : _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_min_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // + // uint16_t d[8]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] < _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] < _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] < _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] < _b[7] ? _a[7] : _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epu16(a, b); + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epu32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2; + // + // uint32_t d[4]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? 
_a[3] : _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epu32(a, b); + // + // return VALIDATE_UINT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_minpos_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // uint16_t index = 0, min = (uint16_t)_a[0]; + // for (int i = 0; i < 8; i++) { + // if ((uint16_t)_a[i] < min) { + // index = (uint16_t)i; + // min = (uint16_t)_a[i]; + // } + // } + // + // uint16_t d[8] = {min, index, 0, 0, 0, 0, 0, 0}; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_minpos_epu16(a); + // return VALIDATE_UINT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_mpsadbw_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c; + // #define TEST_IMPL(IDX) + // uint8_t a_offset##IDX = ((IDX >> 2) & 0x1) * 4; + // uint8_t b_offset##IDX = (IDX & 0x3) * 4; + // + // uint16_t d##IDX[8] = {}; + // for (int i = 0; i < 8; i++) { + // for (int j = 0; j < 4; j++) { + // d##IDX[i] += abs(_a[(a_offset##IDX + i) + j] - _b[b_offset##IDX + + // j]); + // } + // } + // c = _mm_mpsadbw_epu8(a, b, IDX); + // CHECK_RESULT(VALIDATE_UINT16_M128(c,d##IDX)); + // + // IMM_8_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_mul_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int64_t dx = (int64_t)(_a[0]) * (int64_t)(_b[0]); + // int64_t dy = (int64_t)(_a[2]) * (int64_t)(_b[2]); + // + // __m128i a = _mm_loadu_si128((const __m128i *)_a); + // __m128i b = _mm_loadu_si128((const __m128i *)_b); + // __m128i r = _mm_mul_epi32(a, b); + // + // return validate_int64(r, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_mullo_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // int32_t d[4]; + // + // for (int i = 0; i < 4; i++) { + // d[i] = (int32_t)((int64_t)_a[i] * (int64_t)_b[i]); + // } + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_mullo_epi32(a, b); + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_packus_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint16_t max = UINT16_MAX; + // uint16_t min = 0; + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // uint16_t d[8]; + // for (int i = 0; i < 4; i++) { + // if (_a[i] > (int32_t)max) + // d[i] = max; + // else if (_a[i] < (int32_t)min) + // d[i] = min; + // else + // d[i] = (uint16_t)_a[i]; + // } + // for (int i = 0; i < 4; i++) { + // if (_b[i] > (int32_t)max) + // d[i + 4] = max; + // else if (_b[i] < (int32_t)min) + // d[i + 4] = min; + // else + // d[i + 4] = (uint16_t)_b[i]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_packus_epi32(a, b); + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_round_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const 
double *_a = (double *)impl.test_cases_float_pointer1; + // double d[2]; + // __m128d ret; + // + // __m128d a = load_m128d(_a); + // switch (iter & 0x7) { + // case 0: + // d[0] = bankersRounding(_a[0]); + // d[1] = bankersRounding(_a[1]); + // + // ret = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + // break; + // case 1: + // d[0] = floor(_a[0]); + // d[1] = floor(_a[1]); + // + // ret = _mm_round_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // break; + // case 2: + // d[0] = ceil(_a[0]); + // d[1] = ceil(_a[1]); + // + // ret = _mm_round_pd(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // break; + // case 3: + // d[0] = _a[0] > 0 ? floor(_a[0]) : ceil(_a[0]); + // d[1] = _a[1] > 0 ? floor(_a[1]) : ceil(_a[1]); + // + // ret = _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // break; + // case 4: + // d[0] = bankersRounding(_a[0]); + // d[1] = bankersRounding(_a[1]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // ret = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 5: + // d[0] = floor(_a[0]); + // d[1] = floor(_a[1]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // ret = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 6: + // d[0] = ceil(_a[0]); + // d[1] = ceil(_a[1]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // ret = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 7: + // d[0] = _a[0] > 0 ? floor(_a[0]) : ceil(_a[0]); + // d[1] = _a[1] > 0 ? floor(_a[1]) : ceil(_a[1]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // ret = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + // break; + // } + // + // return validate_double(ret, d[0], d[1]); + return TEST_UNIMPL; +} + +result_t test_mm_round_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // float f[4]; + // __m128 ret; + // + // __m128 a = load_m128(_a); + // switch (iter & 0x7) { + // case 0: + // f[0] = bankersRounding(_a[0]); + // f[1] = bankersRounding(_a[1]); + // f[2] = bankersRounding(_a[2]); + // f[3] = bankersRounding(_a[3]); + // + // ret = _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + // break; + // case 1: + // f[0] = floorf(_a[0]); + // f[1] = floorf(_a[1]); + // f[2] = floorf(_a[2]); + // f[3] = floorf(_a[3]); + // + // ret = _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // break; + // case 2: + // f[0] = ceilf(_a[0]); + // f[1] = ceilf(_a[1]); + // f[2] = ceilf(_a[2]); + // f[3] = ceilf(_a[3]); + // + // ret = _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // break; + // case 3: + // f[0] = _a[0] > 0 ? floorf(_a[0]) : ceilf(_a[0]); + // f[1] = _a[1] > 0 ? floorf(_a[1]) : ceilf(_a[1]); + // f[2] = _a[2] > 0 ? floorf(_a[2]) : ceilf(_a[2]); + // f[3] = _a[3] > 0 ? 
floorf(_a[3]) : ceilf(_a[3]); + // + // ret = _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // break; + // case 4: + // f[0] = bankersRounding(_a[0]); + // f[1] = bankersRounding(_a[1]); + // f[2] = bankersRounding(_a[2]); + // f[3] = bankersRounding(_a[3]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // ret = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 5: + // f[0] = floorf(_a[0]); + // f[1] = floorf(_a[1]); + // f[2] = floorf(_a[2]); + // f[3] = floorf(_a[3]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // ret = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 6: + // f[0] = ceilf(_a[0]); + // f[1] = ceilf(_a[1]); + // f[2] = ceilf(_a[2]); + // f[3] = ceilf(_a[3]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // ret = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 7: + // f[0] = _a[0] > 0 ? floorf(_a[0]) : ceilf(_a[0]); + // f[1] = _a[1] > 0 ? floorf(_a[1]) : ceilf(_a[1]); + // f[2] = _a[2] > 0 ? floorf(_a[2]) : ceilf(_a[2]); + // f[3] = _a[3] > 0 ? floorf(_a[3]) : ceilf(_a[3]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // ret = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // break; + // } + // + // return validate_float(ret, f[0], f[1], f[2], f[3]); + return TEST_UNIMPL; +} + +result_t test_mm_round_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (double *)impl.test_cases_float_pointer1; + // const double *_b = (double *)impl.test_cases_float_pointer2; + // double d[2]; + // __m128d ret; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // d[1] = _a[1]; + // switch (iter & 0x7) { + // case 0: + // d[0] = bankersRounding(_b[0]); + // + // ret = _mm_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | + // _MM_FROUND_NO_EXC); break; + // case 1: + // d[0] = floor(_b[0]); + // + // ret = _mm_round_sd(a, b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // break; + // case 2: + // d[0] = ceil(_b[0]); + // + // ret = _mm_round_sd(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // break; + // case 3: + // d[0] = _b[0] > 0 ? floor(_b[0]) : ceil(_b[0]); + // + // ret = _mm_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // break; + // case 4: + // d[0] = bankersRounding(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // ret = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 5: + // d[0] = floor(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // ret = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 6: + // d[0] = ceil(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // ret = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 7: + // d[0] = _b[0] > 0 ? 
floor(_b[0]) : ceil(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // ret = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // } + // + // return validate_double(ret, d[0], d[1]); + return TEST_UNIMPL; +} + +result_t test_mm_round_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float f[4]; + // __m128 ret; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // switch (iter & 0x7) { + // case 0: + // f[0] = bankersRounding(_b[0]); + // + // ret = _mm_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | + // _MM_FROUND_NO_EXC); break; + // case 1: + // f[0] = floorf(_b[0]); + // + // ret = _mm_round_ss(a, b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // break; + // case 2: + // f[0] = ceilf(_b[0]); + // + // ret = _mm_round_ss(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // break; + // case 3: + // f[0] = _b[0] > 0 ? floorf(_b[0]) : ceilf(_b[0]); + // + // ret = _mm_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // break; + // case 4: + // f[0] = bankersRounding(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // ret = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 5: + // f[0] = floorf(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // ret = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 6: + // f[0] = ceilf(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // ret = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 7: + // f[0] = _b[0] > 0 ? floorf(_b[0]) : ceilf(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // ret = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // } + // f[1] = _a[1]; + // f[2] = _a[2]; + // f[3] = _a[3]; + // + // return validate_float(ret, f[0], f[1], f[2], f[3]); + return TEST_UNIMPL; +} + +result_t test_mm_stream_load_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // int32_t *addr = impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_stream_load_si128((__m128i *)addr); + // + // return VALIDATE_INT32_M128(ret, addr); + return TEST_UNIMPL; +} + +result_t test_mm_test_all_ones(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // + // int32_t d0 = ~_a[0] & (~(uint32_t)0); + // int32_t d1 = ~_a[1] & (~(uint32_t)0); + // int32_t d2 = ~_a[2] & (~(uint32_t)0); + // int32_t d3 = ~_a[3] & (~(uint32_t)0); + // int32_t result = ((d0 | d1 | d2 | d3) == 0) ? 1 : 0; + // + // int32_t ret = _mm_test_all_ones(a); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_test_all_zeros(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_mask = (const int32_t *)impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i mask = load_m128i(_mask); + // + // int32_t d0 = _a[0] & _mask[0]; + // int32_t d1 = _a[1] & _mask[1]; + // int32_t d2 = _a[2] & _mask[2]; + // int32_t d3 = _a[3] & _mask[3]; + // int32_t result = ((d0 | d1 | d2 | d3) == 0) ? 1 : 0; + // + // int32_t ret = _mm_test_all_zeros(a, mask); + // + // return result == ret ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_test_mix_ones_zeros(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_mask = (const int32_t *)impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i mask = load_m128i(_mask); + // + // int32_t d0 = !((_a[0]) & _mask[0]) & !((!_a[0]) & _mask[0]); + // int32_t d1 = !((_a[1]) & _mask[1]) & !((!_a[1]) & _mask[1]); + // int32_t d2 = !((_a[2]) & _mask[2]) & !((!_a[2]) & _mask[2]); + // int32_t d3 = !((_a[3]) & _mask[3]) & !((!_a[3]) & _mask[3]); + // int32_t result = ((d0 & d1 & d2 & d3) == 0) ? 1 : 0; + // + // int32_t ret = _mm_test_mix_ones_zeros(a, mask); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_testc_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i b = _mm_load_si128((const __m128i *)_b); + // int testc = 1; + // for (int i = 0; i < 2; i++) { + // if ((~(((SIMDVec *)&a)->m128_u64[i]) & ((SIMDVec *)&b)->m128_u64[i])) { + // testc = 0; + // break; + // } + // } + // return _mm_testc_si128(a, b) == testc ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_testnzc_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_test_mix_ones_zeros(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_testz_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i b = _mm_load_si128((const __m128i *)_b); + // int testz = 1; + // for (int i = 0; i < 2; i++) { + // if ((((SIMDVec *)&a)->m128_u64[i] & ((SIMDVec *)&b)->m128_u64[i])) { + // testz = 0; + // break; + // } + // } + // return _mm_testz_si128(a, b) == testz ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +/* SSE4.2 */ + +result_t test_mm_cmpestrc(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_CMPESTRC_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t result[2]; + // result[0] = _a[0] > _b[0] ? -1 : 0; + // result[1] = _a[1] > _b[1] ? 
-1 : 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i iret = _mm_cmpgt_epi64(a, b); + // + // return validate_int64(iret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpistrs(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_CMPISTRS_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_cmpistrz(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_CMPISTRZ_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_crc32_u16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint32_t crc = *(const uint32_t *)impl.test_cases_int_pointer1; + // uint16_t v = iter; + // uint32_t result = _mm_crc32_u16(crc, v); + // ASSERT_RETURN(result == canonical_crc32_u16(crc, v)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_crc32_u32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint32_t crc = *(const uint32_t *)impl.test_cases_int_pointer1; + // uint32_t v = *(const uint32_t *)impl.test_cases_int_pointer2; + // uint32_t result = _mm_crc32_u32(crc, v); + // ASSERT_RETURN(result == canonical_crc32_u32(crc, v)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_crc32_u64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint64_t crc = *(const uint64_t *)impl.test_cases_int_pointer1; + // uint64_t v = *(const uint64_t *)impl.test_cases_int_pointer2; + // uint64_t result = _mm_crc32_u64(crc, v); + // ASSERT_RETURN(result == canonical_crc32_u64(crc, v)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_crc32_u8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint32_t crc = *(const uint32_t *)impl.test_cases_int_pointer1; + // uint8_t v = iter; + // uint32_t result = _mm_crc32_u8(crc, v); + // ASSERT_RETURN(result == canonical_crc32_u8(crc, v)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +/* AES */ +result_t test_mm_aesenc_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *a = (int32_t *)impl.test_cases_int_pointer1; + // const int32_t *b = (int32_t *)impl.test_cases_int_pointer2; + // __m128i data = _mm_loadu_si128((const __m128i *)a); + // __m128i rk = _mm_loadu_si128((const __m128i *)b); + // + // __m128i resultReference = aesenc_128_reference(data, rk); + // __m128i resultIntrinsic = _mm_aesenc_si128(data, rk); + // + // return validate_128bits(resultReference, resultIntrinsic); + return TEST_UNIMPL; +} + +result_t test_mm_aesdec_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *a = (int32_t *)impl.test_cases_int_pointer1; + // const int32_t *b = (int32_t *)impl.test_cases_int_pointer2; + // __m128i data = _mm_loadu_si128((const __m128i *)a); + // __m128i rk = _mm_loadu_si128((const __m128i *)b); + // + // __m128i resultReference = aesdec_128_reference(data, rk); + // __m128i resultIntrinsic = _mm_aesdec_si128(data, rk); + // + // return validate_128bits(resultReference, resultIntrinsic); + return TEST_UNIMPL; +} + +result_t test_mm_aesenclast_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const int32_t *a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *b = (const int32_t *)impl.test_cases_int_pointer2; + // __m128i data = _mm_loadu_si128((const __m128i *)a); + // __m128i rk = _mm_loadu_si128((const __m128i *)b); + // + // __m128i resultReference = aesenclast_128_reference(data, rk); + // __m128i resultIntrinsic = _mm_aesenclast_si128(data, rk); + // + // return validate_128bits(resultReference, 
resultIntrinsic); + return TEST_UNIMPL; +} + +result_t test_mm_aesdeclast_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const uint8_t *a = (uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *rk = (uint8_t *)impl.test_cases_int_pointer2; + // __m128i _a = _mm_loadu_si128((const __m128i *)a); + // __m128i _rk = _mm_loadu_si128((const __m128i *)rk); + // uint8_t c[16] = {}; + // + // uint8_t v[4][4]; + // for (int i = 0; i < 16; ++i) { + // v[((i / 4) + (i % 4)) % 4][i % 4] = crypto_aes_rsbox[a[i]]; + // } + // for (int i = 0; i < 16; ++i) { + // c[i] = v[i / 4][i % 4] ^ rk[i]; + // } + // + // __m128i result_reference = _mm_loadu_si128((const __m128i *)c); + // __m128i result_intrinsic = _mm_aesdeclast_si128(_a, _rk); + // + // return validate_128bits(result_reference, result_intrinsic); + return TEST_UNIMPL; +} + +result_t test_mm_aesimc_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *a = (uint8_t *)impl.test_cases_int_pointer1; + // __m128i _a = _mm_loadu_si128((const __m128i *)a); + // + // uint8_t e, f, g, h, v[4][4]; + // for (int i = 0; i < 16; ++i) { + // ((uint8_t *)v)[i] = a[i]; + // } + // for (int i = 0; i < 4; ++i) { + // e = v[i][0]; + // f = v[i][1]; + // g = v[i][2]; + // h = v[i][3]; + // + // v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^ + // MULTIPLY(h, 0x09); + // v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^ + // MULTIPLY(h, 0x0d); + // v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^ + // MULTIPLY(h, 0x0b); + // v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^ + // MULTIPLY(h, 0x0e); + // } + // + // __m128i result_reference = _mm_loadu_si128((const __m128i *)v); + // __m128i result_intrinsic = _mm_aesimc_si128(_a); + // + // return validate_128bits(result_reference, result_intrinsic); + return TEST_UNIMPL; +} + +static inline uint32_t sub_word(uint32_t in) { + return (crypto_aes_sbox[(in >> 24) & 0xff] << 24) | + (crypto_aes_sbox[(in >> 16) & 0xff] << 16) | + (crypto_aes_sbox[(in >> 8) & 0xff] << 8) | + (crypto_aes_sbox[in & 0xff]); +} + +// FIXME: improve the test case for AES-256 key expansion. 
+// Reference: +// https://github.com/randombit/botan/blob/master/src/lib/block/aes/aes_ni/aes_ni.cpp +result_t test_mm_aeskeygenassist_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const uint32_t *a = (uint32_t *)impl.test_cases_int_pointer1; + // __m128i data = load_m128i(a); + // uint32_t sub_x1 = sub_word(a[1]); + // uint32_t sub_x3 = sub_word(a[3]); + // __m128i result_reference; + // __m128i result_intrinsic; + // #define TEST_IMPL(IDX) + // uint32_t res##IDX[4] = { + // sub_x1, + // rotr(sub_x1, 8) ^ IDX, + // sub_x3, + // rotr(sub_x3, 8) ^ IDX, + // }; + // result_reference = load_m128i(res##IDX); + // result_intrinsic = _mm_aeskeygenassist_si128(data, IDX); + // CHECK_RESULT(validate_128bits(result_reference, result_intrinsic)); + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +/* Others */ +result_t test_mm_clmulepi64_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const uint64_t *_a = (const uint64_t *)impl.test_cases_int_pointer1; + // const uint64_t *_b = (const uint64_t *)impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // auto result = clmul_64(_a[0], _b[0]); + // if (!validate_uint64(_mm_clmulepi64_si128(a, b, 0x00), result.first, + // result.second)) + // return TEST_FAIL; + // result = clmul_64(_a[1], _b[0]); + // if (!validate_uint64(_mm_clmulepi64_si128(a, b, 0x01), result.first, + // result.second)) + // return TEST_FAIL; + // result = clmul_64(_a[0], _b[1]); + // if (!validate_uint64(_mm_clmulepi64_si128(a, b, 0x10), result.first, + // result.second)) + // return TEST_FAIL; + // result = clmul_64(_a[1], _b[1]); + // if (!validate_uint64(_mm_clmulepi64_si128(a, b, 0x11), result.first, + // result.second)) + // return TEST_FAIL; + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_get_denormals_zero_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // int res_denormals_zero_on, res_denormals_zero_off; + // + // _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + // res_denormals_zero_on = + // _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON; + // + // _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); + // res_denormals_zero_off = + // _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_OFF; + // + // return (res_denormals_zero_on && res_denormals_zero_off) ? 
TEST_SUCCESS + // : TEST_FAIL; + return TEST_UNIMPL; +} + +// static int popcnt_reference(uint64_t a) { +// int count = 0; +// while (a != 0) { +// count += a & 1; +// a >>= 1; +// } +// return count; +// } + +result_t test_mm_popcnt_u32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint64_t *a = (const uint64_t *)impl.test_cases_int_pointer1; + // ASSERT_RETURN(popcnt_reference((uint32_t)a[0]) == _mm_popcnt_u32(a[0])); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_popcnt_u64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint64_t *a = (const uint64_t *)impl.test_cases_int_pointer1; + // ASSERT_RETURN(popcnt_reference(a[0]) == _mm_popcnt_u64(a[0])); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_set_denormals_zero_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // result_t res_set_denormals_zero_on, res_set_denormals_zero_off; + // float factor = 2; + // float denormal = FLT_MIN / factor; + // float denormals[4] = {denormal, denormal, denormal, denormal}; + // float factors[4] = {factor, factor, factor, factor}; + // __m128 ret; + // + // _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + // ret = _mm_mul_ps(load_m128(denormals), load_m128(factors)); + // res_set_denormals_zero_on = validate_float(ret, 0, 0, 0, 0); + // + // _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); + // ret = _mm_mul_ps(load_m128(denormals), load_m128(factors)); + // res_set_denormals_zero_off = + // validate_float(ret, FLT_MIN, FLT_MIN, FLT_MIN, FLT_MIN); + // + // if (res_set_denormals_zero_on == TEST_FAIL || + // res_set_denormals_zero_off == TEST_FAIL) + // return TEST_FAIL; + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_rdtsc(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint64_t start = _rdtsc(); + // for (int i = 0; i < 100000; i++) { + // #if defined(_MSC_VER) + // _ReadWriteBarrier(); + // #else + // __asm__ __volatile__("" ::: "memory"); + // #endif + // } + // uint64_t end = _rdtsc(); + // return end > start ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +#if defined(__riscv_v_elen) +#define REGISTER_SIZE __riscv_v_elen +#elif defined(__aarch64__) +#define REGISTER_SIZE 128 +#elif (defined(__x86_64__) || defined(__i386__)) +#define REGISTER_SIZE sizeof(__m128) +#endif + +SSE2RVV_TEST_IMPL::SSE2RVV_TEST_IMPL(void) { + test_cases_float_pointer1 = (float *)platform_aligned_alloc(REGISTER_SIZE); + test_cases_float_pointer2 = (float *)platform_aligned_alloc(REGISTER_SIZE); + test_cases_int_pointer1 = (int32_t *)platform_aligned_alloc(REGISTER_SIZE); + test_cases_int_pointer2 = (int32_t *)platform_aligned_alloc(REGISTER_SIZE); + SSE2RVV_INIT_RNG(123456); + for (uint32_t i = 0; i < MAX_TEST_VALUE; i++) { + test_cases_floats[i] = ranf(-100000, 100000); + test_cases_ints[i] = (int32_t)ranf(-100000, 100000); + } +} + +// Dummy function to match the case label in run_single_test. 
+result_t test_last(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return TEST_SUCCESS; +} + +result_t SSE2RVV_TEST_IMPL::load_test_float_pointers(uint32_t i) { + // result_t ret = do_mm_store_ps( + // test_cases_float_pointer1, test_cases_floats[i], test_cases_floats[i + + // 1], test_cases_floats[i + 2], test_cases_floats[i + 3]); + // if (ret == TEST_SUCCESS) { + // ret = do_mm_store_ps(test_cases_float_pointer2, test_cases_floats[i + 4], + // test_cases_floats[i + 5], test_cases_floats[i + 6], + // test_cases_floats[i + 7]); + // } + // return ret; + return TEST_UNIMPL; +} + +result_t SSE2RVV_TEST_IMPL::load_test_int_pointers(uint32_t i) { + // result_t ret = do_mm_store_ps(test_cases_int_pointer1, test_cases_ints[i], + // test_cases_ints[i + 1], test_cases_ints[i + + // 2], test_cases_ints[i + 3]); + // if (ret == TEST_SUCCESS) { + // ret = do_mm_store_ps(test_cases_int_pointer2, test_cases_ints[i + 4], + // test_cases_ints[i + 5], test_cases_ints[i + 6], + // test_cases_ints[i + 7]); + // } + + // return ret; + return TEST_UNIMPL; +} + +result_t SSE2RVV_TEST_IMPL::run_single_test(INSTRUCTION_TEST test, uint32_t i) { + result_t ret = TEST_SUCCESS; + + switch (test) { +#define _(x) \ + case it_##x: \ + ret = test_##x(*this, i); \ + break; + INTRIN_LIST +#undef _ + } + + return ret; +} + +const char *instruction_string[] = { +#define _(x) #x, + INTRIN_LIST +#undef _ +}; + +SSE2RVV_TEST *SSE2RVV_TEST::create(void) { + SSE2RVV_TEST_IMPL *st = new SSE2RVV_TEST_IMPL; + return static_cast<SSE2RVV_TEST *>(st); +} + +} // namespace SSE2RVV diff --git a/tests/impl.h b/tests/impl.h new file mode 100644 index 0000000..bdf4931 --- /dev/null +++ b/tests/impl.h @@ -0,0 +1,570 @@ +#ifndef SSE2RVV_TEST_H +#define SSE2RVV_TEST_H + +#include "common.h" + +#define INTRIN_LIST \ + /* MMX */ \ + _(mm_empty) \ + /* SSE */ \ + _(mm_add_ps) \ + _(mm_add_ss) \ + _(mm_and_ps) \ + _(mm_andnot_ps) \ + _(mm_avg_pu16) \ + _(mm_avg_pu8) \ + _(mm_cmpeq_ps) \ + _(mm_cmpeq_ss) \ + _(mm_cmpge_ps) \ + _(mm_cmpge_ss) \ + _(mm_cmpgt_ps) \ + _(mm_cmpgt_ss) \ + _(mm_cmple_ps) \ + _(mm_cmple_ss) \ + _(mm_cmplt_ps) \ + _(mm_cmplt_ss) \ + _(mm_cmpneq_ps) \ + _(mm_cmpneq_ss) \ + _(mm_cmpnge_ps) \ + _(mm_cmpnge_ss) \ + _(mm_cmpngt_ps) \ + _(mm_cmpngt_ss) \ + _(mm_cmpnle_ps) \ + _(mm_cmpnle_ss) \ + _(mm_cmpnlt_ps) \ + _(mm_cmpnlt_ss) \ + _(mm_cmpord_ps) \ + _(mm_cmpord_ss) \ + _(mm_cmpunord_ps) \ + _(mm_cmpunord_ss) \ + _(mm_comieq_ss) \ + _(mm_comige_ss) \ + _(mm_comigt_ss) \ + _(mm_comile_ss) \ + _(mm_comilt_ss) \ + _(mm_comineq_ss) \ + _(mm_cvt_pi2ps) \ + _(mm_cvt_ps2pi) \ + _(mm_cvt_si2ss) \ + _(mm_cvt_ss2si) \ + _(mm_cvtpi16_ps) \ + _(mm_cvtpi32_ps) \ + _(mm_cvtpi32x2_ps) \ + _(mm_cvtpi8_ps) \ + _(mm_cvtps_pi16) \ + _(mm_cvtps_pi32) \ + _(mm_cvtps_pi8) \ + _(mm_cvtpu16_ps) \ + _(mm_cvtpu8_ps) \ + _(mm_cvtsi32_ss) \ + _(mm_cvtsi64_ss) \ + _(mm_cvtss_f32) \ + _(mm_cvtss_si32) \ + _(mm_cvtss_si64) \ + _(mm_cvtt_ps2pi) \ + _(mm_cvtt_ss2si) \ + _(mm_cvttps_pi32) \ + _(mm_cvttss_si32) \ + _(mm_cvttss_si64) \ + _(mm_div_ps) \ + _(mm_div_ss) \ + _(mm_extract_pi16) \ + _(mm_free) \ + _(mm_get_flush_zero_mode) \ + _(mm_get_rounding_mode) \ + _(mm_getcsr) \ + _(mm_insert_pi16) \ + _(mm_load_ps) \ + _(mm_load_ps1) \ + _(mm_load_ss) \ + _(mm_load1_ps) \ + _(mm_loadh_pi) \ + _(mm_loadl_pi) \ + _(mm_loadr_ps) \ + _(mm_loadu_ps) \ + _(mm_loadu_si16) \ + _(mm_loadu_si64) \ + _(mm_malloc) \ + _(mm_maskmove_si64) \ + _(m_maskmovq) \ + _(mm_max_pi16) \ + _(mm_max_ps) \ + _(mm_max_pu8) \ + _(mm_max_ss) \ + _(mm_min_pi16) \ + _(mm_min_ps) \ + _(mm_min_pu8) \ + 
_(mm_min_ss) \ + _(mm_move_ss) \ + _(mm_movehl_ps) \ + _(mm_movelh_ps) \ + _(mm_movemask_pi8) \ + _(mm_movemask_ps) \ + _(mm_mul_ps) \ + _(mm_mul_ss) \ + _(mm_mulhi_pu16) \ + _(mm_or_ps) \ + _(m_pavgb) \ + _(m_pavgw) \ + _(m_pextrw) \ + _(m_pinsrw) \ + _(m_pmaxsw) \ + _(m_pmaxub) \ + _(m_pminsw) \ + _(m_pminub) \ + _(m_pmovmskb) \ + _(m_pmulhuw) \ + _(mm_prefetch) \ + _(m_psadbw) \ + _(m_pshufw) \ + _(mm_rcp_ps) \ + _(mm_rcp_ss) \ + _(mm_rsqrt_ps) \ + _(mm_rsqrt_ss) \ + _(mm_sad_pu8) \ + _(mm_set_flush_zero_mode) \ + _(mm_set_ps) \ + _(mm_set_ps1) \ + _(mm_set_rounding_mode) \ + _(mm_set_ss) \ + _(mm_set1_ps) \ + _(mm_setcsr) \ + _(mm_setr_ps) \ + _(mm_setzero_ps) \ + _(mm_sfence) \ + _(mm_shuffle_pi16) \ + _(mm_shuffle_ps) \ + _(mm_sqrt_ps) \ + _(mm_sqrt_ss) \ + _(mm_store_ps) \ + _(mm_store_ps1) \ + _(mm_store_ss) \ + _(mm_store1_ps) \ + _(mm_storeh_pi) \ + _(mm_storel_pi) \ + _(mm_storer_ps) \ + _(mm_storeu_ps) \ + _(mm_storeu_si16) \ + _(mm_storeu_si64) \ + _(mm_stream_pi) \ + _(mm_stream_ps) \ + _(mm_sub_ps) \ + _(mm_sub_ss) \ + _(mm_ucomieq_ss) \ + _(mm_ucomige_ss) \ + _(mm_ucomigt_ss) \ + _(mm_ucomile_ss) \ + _(mm_ucomilt_ss) \ + _(mm_ucomineq_ss) \ + _(mm_undefined_ps) \ + _(mm_unpackhi_ps) \ + _(mm_unpacklo_ps) \ + _(mm_xor_ps) \ + /* SSE2 */ \ + _(mm_add_epi16) \ + _(mm_add_epi32) \ + _(mm_add_epi64) \ + _(mm_add_epi8) \ + _(mm_add_pd) \ + _(mm_add_sd) \ + _(mm_add_si64) \ + _(mm_adds_epi16) \ + _(mm_adds_epi8) \ + _(mm_adds_epu16) \ + _(mm_adds_epu8) \ + _(mm_and_pd) \ + _(mm_and_si128) \ + _(mm_andnot_pd) \ + _(mm_andnot_si128) \ + _(mm_avg_epu16) \ + _(mm_avg_epu8) \ + _(mm_bslli_si128) \ + _(mm_bsrli_si128) \ + _(mm_castpd_ps) \ + _(mm_castpd_si128) \ + _(mm_castps_pd) \ + _(mm_castps_si128) \ + _(mm_castsi128_pd) \ + _(mm_castsi128_ps) \ + _(mm_clflush) \ + _(mm_cmpeq_epi16) \ + _(mm_cmpeq_epi32) \ + _(mm_cmpeq_epi8) \ + _(mm_cmpeq_pd) \ + _(mm_cmpeq_sd) \ + _(mm_cmpge_pd) \ + _(mm_cmpge_sd) \ + _(mm_cmpgt_epi16) \ + _(mm_cmpgt_epi32) \ + _(mm_cmpgt_epi8) \ + _(mm_cmpgt_pd) \ + _(mm_cmpgt_sd) \ + _(mm_cmple_pd) \ + _(mm_cmple_sd) \ + _(mm_cmplt_epi16) \ + _(mm_cmplt_epi32) \ + _(mm_cmplt_epi8) \ + _(mm_cmplt_pd) \ + _(mm_cmplt_sd) \ + _(mm_cmpneq_pd) \ + _(mm_cmpneq_sd) \ + _(mm_cmpnge_pd) \ + _(mm_cmpnge_sd) \ + _(mm_cmpngt_pd) \ + _(mm_cmpngt_sd) \ + _(mm_cmpnle_pd) \ + _(mm_cmpnle_sd) \ + _(mm_cmpnlt_pd) \ + _(mm_cmpnlt_sd) \ + _(mm_cmpord_pd) \ + _(mm_cmpord_sd) \ + _(mm_cmpunord_pd) \ + _(mm_cmpunord_sd) \ + _(mm_comieq_sd) \ + _(mm_comige_sd) \ + _(mm_comigt_sd) \ + _(mm_comile_sd) \ + _(mm_comilt_sd) \ + _(mm_comineq_sd) \ + _(mm_cvtepi32_pd) \ + _(mm_cvtepi32_ps) \ + _(mm_cvtpd_epi32) \ + _(mm_cvtpd_pi32) \ + _(mm_cvtpd_ps) \ + _(mm_cvtpi32_pd) \ + _(mm_cvtps_epi32) \ + _(mm_cvtps_pd) \ + _(mm_cvtsd_f64) \ + _(mm_cvtsd_si32) \ + _(mm_cvtsd_si64) \ + _(mm_cvtsd_si64x) \ + _(mm_cvtsd_ss) \ + _(mm_cvtsi128_si32) \ + _(mm_cvtsi128_si64) \ + _(mm_cvtsi128_si64x) \ + _(mm_cvtsi32_sd) \ + _(mm_cvtsi32_si128) \ + _(mm_cvtsi64_sd) \ + _(mm_cvtsi64_si128) \ + _(mm_cvtsi64x_sd) \ + _(mm_cvtsi64x_si128) \ + _(mm_cvtss_sd) \ + _(mm_cvttpd_epi32) \ + _(mm_cvttpd_pi32) \ + _(mm_cvttps_epi32) \ + _(mm_cvttsd_si32) \ + _(mm_cvttsd_si64) \ + _(mm_cvttsd_si64x) \ + _(mm_div_pd) \ + _(mm_div_sd) \ + _(mm_extract_epi16) \ + _(mm_insert_epi16) \ + _(mm_lfence) \ + _(mm_load_pd) \ + _(mm_load_pd1) \ + _(mm_load_sd) \ + _(mm_load_si128) \ + _(mm_load1_pd) \ + _(mm_loadh_pd) \ + _(mm_loadl_epi64) \ + _(mm_loadl_pd) \ + _(mm_loadr_pd) \ + _(mm_loadu_pd) \ + _(mm_loadu_si128) \ + 
_(mm_loadu_si32) \ + _(mm_madd_epi16) \ + _(mm_maskmoveu_si128) \ + _(mm_max_epi16) \ + _(mm_max_epu8) \ + _(mm_max_pd) \ + _(mm_max_sd) \ + _(mm_mfence) \ + _(mm_min_epi16) \ + _(mm_min_epu8) \ + _(mm_min_pd) \ + _(mm_min_sd) \ + _(mm_move_epi64) \ + _(mm_move_sd) \ + _(mm_movemask_epi8) \ + _(mm_movemask_pd) \ + _(mm_movepi64_pi64) \ + _(mm_movpi64_epi64) \ + _(mm_mul_epu32) \ + _(mm_mul_pd) \ + _(mm_mul_sd) \ + _(mm_mul_su32) \ + _(mm_mulhi_epi16) \ + _(mm_mulhi_epu16) \ + _(mm_mullo_epi16) \ + _(mm_or_pd) \ + _(mm_or_si128) \ + _(mm_packs_epi16) \ + _(mm_packs_epi32) \ + _(mm_packus_epi16) \ + _(mm_pause) \ + _(mm_sad_epu8) \ + _(mm_set_epi16) \ + _(mm_set_epi32) \ + _(mm_set_epi64) \ + _(mm_set_epi64x) \ + _(mm_set_epi8) \ + _(mm_set_pd) \ + _(mm_set_pd1) \ + _(mm_set_sd) \ + _(mm_set1_epi16) \ + _(mm_set1_epi32) \ + _(mm_set1_epi64) \ + _(mm_set1_epi64x) \ + _(mm_set1_epi8) \ + _(mm_set1_pd) \ + _(mm_setr_epi16) \ + _(mm_setr_epi32) \ + _(mm_setr_epi64) \ + _(mm_setr_epi8) \ + _(mm_setr_pd) \ + _(mm_setzero_pd) \ + _(mm_setzero_si128) \ + _(mm_shuffle_epi32) \ + _(mm_shuffle_pd) \ + _(mm_shufflehi_epi16) \ + _(mm_shufflelo_epi16) \ + _(mm_sll_epi16) \ + _(mm_sll_epi32) \ + _(mm_sll_epi64) \ + _(mm_slli_epi16) \ + _(mm_slli_epi32) \ + _(mm_slli_epi64) \ + _(mm_slli_si128) \ + _(mm_sqrt_pd) \ + _(mm_sqrt_sd) \ + _(mm_sra_epi16) \ + _(mm_sra_epi32) \ + _(mm_srai_epi16) \ + _(mm_srai_epi32) \ + _(mm_srl_epi16) \ + _(mm_srl_epi32) \ + _(mm_srl_epi64) \ + _(mm_srli_epi16) \ + _(mm_srli_epi32) \ + _(mm_srli_epi64) \ + _(mm_srli_si128) \ + _(mm_store_pd) \ + _(mm_store_pd1) \ + _(mm_store_sd) \ + _(mm_store_si128) \ + _(mm_store1_pd) \ + _(mm_storeh_pd) \ + _(mm_storel_epi64) \ + _(mm_storel_pd) \ + _(mm_storer_pd) \ + _(mm_storeu_pd) \ + _(mm_storeu_si128) \ + _(mm_storeu_si32) \ + _(mm_stream_pd) \ + _(mm_stream_si128) \ + _(mm_stream_si32) \ + _(mm_stream_si64) \ + _(mm_sub_epi16) \ + _(mm_sub_epi32) \ + _(mm_sub_epi64) \ + _(mm_sub_epi8) \ + _(mm_sub_pd) \ + _(mm_sub_sd) \ + _(mm_sub_si64) \ + _(mm_subs_epi16) \ + _(mm_subs_epi8) \ + _(mm_subs_epu16) \ + _(mm_subs_epu8) \ + _(mm_ucomieq_sd) \ + _(mm_ucomige_sd) \ + _(mm_ucomigt_sd) \ + _(mm_ucomile_sd) \ + _(mm_ucomilt_sd) \ + _(mm_ucomineq_sd) \ + _(mm_undefined_pd) \ + _(mm_undefined_si128) \ + _(mm_unpackhi_epi16) \ + _(mm_unpackhi_epi32) \ + _(mm_unpackhi_epi64) \ + _(mm_unpackhi_epi8) \ + _(mm_unpackhi_pd) \ + _(mm_unpacklo_epi16) \ + _(mm_unpacklo_epi32) \ + _(mm_unpacklo_epi64) \ + _(mm_unpacklo_epi8) \ + _(mm_unpacklo_pd) \ + _(mm_xor_pd) \ + _(mm_xor_si128) \ + /* SSE3 */ \ + _(mm_addsub_pd) \ + _(mm_addsub_ps) \ + _(mm_hadd_pd) \ + _(mm_hadd_ps) \ + _(mm_hsub_pd) \ + _(mm_hsub_ps) \ + _(mm_lddqu_si128) \ + _(mm_loaddup_pd) \ + _(mm_movedup_pd) \ + _(mm_movehdup_ps) \ + _(mm_moveldup_ps) \ + /* SSSE3 */ \ + _(mm_abs_epi16) \ + _(mm_abs_epi32) \ + _(mm_abs_epi8) \ + _(mm_abs_pi16) \ + _(mm_abs_pi32) \ + _(mm_abs_pi8) \ + _(mm_alignr_epi8) \ + _(mm_alignr_pi8) \ + _(mm_hadd_epi16) \ + _(mm_hadd_epi32) \ + _(mm_hadd_pi16) \ + _(mm_hadd_pi32) \ + _(mm_hadds_epi16) \ + _(mm_hadds_pi16) \ + _(mm_hsub_epi16) \ + _(mm_hsub_epi32) \ + _(mm_hsub_pi16) \ + _(mm_hsub_pi32) \ + _(mm_hsubs_epi16) \ + _(mm_hsubs_pi16) \ + _(mm_maddubs_epi16) \ + _(mm_maddubs_pi16) \ + _(mm_mulhrs_epi16) \ + _(mm_mulhrs_pi16) \ + _(mm_shuffle_epi8) \ + _(mm_shuffle_pi8) \ + _(mm_sign_epi16) \ + _(mm_sign_epi32) \ + _(mm_sign_epi8) \ + _(mm_sign_pi16) \ + _(mm_sign_pi32) \ + _(mm_sign_pi8) \ + /* SSE4.1 */ \ + _(mm_blend_epi16) \ + _(mm_blend_pd) \ + 
_(mm_blend_ps) \ + _(mm_blendv_epi8) \ + _(mm_blendv_pd) \ + _(mm_blendv_ps) \ + _(mm_ceil_pd) \ + _(mm_ceil_ps) \ + _(mm_ceil_sd) \ + _(mm_ceil_ss) \ + _(mm_cmpeq_epi64) \ + _(mm_cvtepi16_epi32) \ + _(mm_cvtepi16_epi64) \ + _(mm_cvtepi32_epi64) \ + _(mm_cvtepi8_epi16) \ + _(mm_cvtepi8_epi32) \ + _(mm_cvtepi8_epi64) \ + _(mm_cvtepu16_epi32) \ + _(mm_cvtepu16_epi64) \ + _(mm_cvtepu32_epi64) \ + _(mm_cvtepu8_epi16) \ + _(mm_cvtepu8_epi32) \ + _(mm_cvtepu8_epi64) \ + _(mm_dp_pd) \ + _(mm_dp_ps) \ + _(mm_extract_epi32) \ + _(mm_extract_epi64) \ + _(mm_extract_epi8) \ + _(mm_extract_ps) \ + _(mm_floor_pd) \ + _(mm_floor_ps) \ + _(mm_floor_sd) \ + _(mm_floor_ss) \ + _(mm_insert_epi32) \ + _(mm_insert_epi64) \ + _(mm_insert_epi8) \ + _(mm_insert_ps) \ + _(mm_max_epi32) \ + _(mm_max_epi8) \ + _(mm_max_epu16) \ + _(mm_max_epu32) \ + _(mm_min_epi32) \ + _(mm_min_epi8) \ + _(mm_min_epu16) \ + _(mm_min_epu32) \ + _(mm_minpos_epu16) \ + _(mm_mpsadbw_epu8) \ + _(mm_mul_epi32) \ + _(mm_mullo_epi32) \ + _(mm_packus_epi32) \ + _(mm_round_pd) \ + _(mm_round_ps) \ + _(mm_round_sd) \ + _(mm_round_ss) \ + _(mm_stream_load_si128) \ + _(mm_test_all_ones) \ + _(mm_test_all_zeros) \ + _(mm_test_mix_ones_zeros) \ + _(mm_testc_si128) \ + _(mm_testnzc_si128) \ + _(mm_testz_si128) \ + /* SSE4.2 */ \ + /*_(mm_cmpestra)*/ \ + /*_(mm_cmpestrc) */ \ + /*_(mm_cmpestri) */ \ + /*_(mm_cmpestrm) */ \ + /*_(mm_cmpestro) */ \ + /*_(mm_cmpestrs) */ \ + /*_(mm_cmpestrz) */ \ + /*_(mm_cmpgt_epi64) */ \ + /*_(mm_cmpistra) */ \ + /*_(mm_cmpistrc) */ \ + /*_(mm_cmpistri) */ \ + /*_(mm_cmpistrm) */ \ + /*_(mm_cmpistro) */ \ + /*_(mm_cmpistrs) */ \ + /*_(mm_cmpistrz) */ \ + /*_(mm_crc32_u16) */ \ + /*_(mm_crc32_u32) */ \ + /*_(mm_crc32_u64) */ \ + /*_(mm_crc32_u8) */ \ + /* AES */ \ + _(mm_aesenc_si128) \ + _(mm_aesdec_si128) \ + _(mm_aesenclast_si128) \ + _(mm_aesdeclast_si128) \ + _(mm_aesimc_si128) \ + _(mm_aeskeygenassist_si128) \ + /* Others */ \ + _(mm_clmulepi64_si128) \ + _(mm_get_denormals_zero_mode) \ + _(mm_popcnt_u32) \ + _(mm_popcnt_u64) \ + _(mm_set_denormals_zero_mode) \ + _(rdtsc) \ + _(last) /* This indicates the end of macros */ + +namespace SSE2RVV { +// The way unit tests are implemented is that 10,000 random floating point and +// integer vec4 numbers are generated as sample data. +// +// A short C implementation of every intrinsic is implemented and compared to +// the actual expected results from the corresponding SSE intrinsic against all +// of the 10,000 randomized input vectors. When running on RISCV, then the +// results are compared to the RISCV approximate version. +extern const char *instruction_string[]; +enum INSTRUCTION_TEST { +#define _(x) it_##x, + INTRIN_LIST +#undef _ +}; + +class SSE2RVV_TEST { +public: + static SSE2RVV_TEST *create(void); // create the test. 
+ + // Run test of this instruction; + // Passed: TEST_SUCCESS (1) + // Failed: TEST_FAIL (0) + // Unimplemented: TEST_UNIMPL (-1) + virtual result_t run_test(INSTRUCTION_TEST test) = 0; + virtual void release(void) = 0; +}; + +} // namespace SSE2RVV + +#endif // SSE2RVV_TEST_H diff --git a/tests/main.cpp b/tests/main.cpp new file mode 100644 index 0000000..e7deff6 --- /dev/null +++ b/tests/main.cpp @@ -0,0 +1,36 @@ +#include "impl.h" +#include <cstdint> +#include <cstdio> + +int main(int /*argc*/, const char ** /*argv*/) { + SSE2RVV::SSE2RVV_TEST *test = SSE2RVV::SSE2RVV_TEST::create(); + uint32_t pass_count = 0; + uint32_t failed_count = 0; + uint32_t ignore_count = 0; + for (uint32_t i = 0; i < SSE2RVV::it_last; i++) { + SSE2RVV::INSTRUCTION_TEST it = SSE2RVV::INSTRUCTION_TEST(i); + SSE2RVV::result_t ret = test->run_test(it); + // If a test fails, re-run the binary under a debugger and set a + // breakpoint here to step into the failing case. + if (ret == SSE2RVV::TEST_FAIL) { + printf("Test %-30s failed\n", SSE2RVV::instruction_string[it]); + failed_count++; + } else if (ret == SSE2RVV::TEST_UNIMPL) { + printf("Test %-30s skipped\n", SSE2RVV::instruction_string[it]); + ignore_count++; + } else { + printf("Test %-30s passed\n", SSE2RVV::instruction_string[it]); + pass_count++; + } + } + test->release(); + printf("SSE2RVV_TEST Complete!\n" + "Passed: %d\n" + "Failed: %d\n" + "Ignored: %d\n" + "Coverage rate: %.2f%%\n", + pass_count, failed_count, ignore_count, + (float)pass_count / (pass_count + failed_count + ignore_count) * 100); + + return failed_count ? -1 : 0; +}
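
A note on the generator pattern in the patch above: `INTRIN_LIST` is an X-macro, and `run_single_test`, `instruction_string[]`, and the `INSTRUCTION_TEST` enum are all produced from that single list by redefining `_` before each expansion, so adding one `_(mm_xxx)` entry wires a new test into the enum, the name table, and the dispatch switch at once. Below is a minimal standalone sketch of the same technique; the three-entry `DEMO_LIST` and its names are made up for illustration and are not part of the patch.

```C
#include <stdio.h>

/* Hypothetical stand-in for INTRIN_LIST. */
#define DEMO_LIST \
  _(mm_add_ps)    \
  _(mm_sub_ps)    \
  _(last)

/* 1. Enum of test identifiers (mirrors INSTRUCTION_TEST). */
enum DEMO_TEST {
#define _(x) it_##x,
  DEMO_LIST
#undef _
};

/* 2. Parallel string table (mirrors instruction_string[]). */
static const char *demo_string[] = {
#define _(x) #x,
    DEMO_LIST
#undef _
};

int main(void) {
  /* 3. Dispatch switch (mirrors run_single_test). */
  for (int i = 0; i < it_last; i++) {
    switch (i) {
#define _(x)                                              \
  case it_##x:                                            \
    printf("would dispatch to test_%s\n", demo_string[i]); \
    break;
      DEMO_LIST
#undef _
    }
  }
  return 0;
}
```

The trailing `_(last)` entry plays the same role as `_(last)` in `INTRIN_LIST`: it terminates the enum so the driver in `tests/main.cpp` can loop over `it_last` entries.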
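Most of the SSE4.1/SSE4.2/AES cases in this part of the patch are still stubs that return `TEST_UNIMPL`, with their reference logic kept in comments. They all follow the flow described in `impl.h`: build a scalar reference from the raw input buffers, run the intrinsic on loaded vectors, and compare lane by lane. The standalone sketch below restates that flow for `_mm_cvtepi16_epi32` using plain SSE4.1 headers and fixed inputs of my own choosing; inside the suite the same steps go through `load_m128i`, `VALIDATE_INT32_M128`, and the randomized test buffers instead.

```C
#include <smmintrin.h> /* SSE4.1; build with -msse4.1 on x86 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  int16_t in[8] = {-3, 7, -20000, 30000, 1, 2, 3, 4};

  /* 1. Scalar reference: sign-extend the low four 16-bit lanes. */
  int32_t ref[4];
  for (int i = 0; i < 4; i++)
    ref[i] = (int32_t)in[i];

  /* 2. Run the intrinsic (or its sse2rvv.h replacement on RISC-V). */
  __m128i a = _mm_loadu_si128((const __m128i *)in);
  __m128i c = _mm_cvtepi16_epi32(a);

  /* 3. Validate lane by lane. */
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, c);
  int ok = memcmp(ref, out, sizeof(ref)) == 0;
  printf("%s\n", ok ? "TEST_SUCCESS" : "TEST_FAIL");
  return ok ? 0 : 1;
}
```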