diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..8e75eee --- /dev/null +++ b/.clang-format @@ -0,0 +1,12 @@ +ColumnLimit: 80 +BraceWrapping: + AfterFunction: true + AfterNamespace: true + AfterStruct: true + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterUnion: true + AfterExternBlock: true + SplitEmptyFunction: false + SplitEmptyRecord: false diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml new file mode 100644 index 0000000..923d520 --- /dev/null +++ b/.github/workflows/github_actions.yml @@ -0,0 +1,45 @@ +name: Github Actions + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + cross_compile_tests: + runs-on: ubuntu-20.04 + steps: + - name: checkout code + uses: actions/checkout@v3.2.0 + - name: setup riscv toolchain + run: | + mkdir /opt/riscv + export PATH=$PATH:/opt/riscv/bin + wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.10.18/riscv64-elf-ubuntu-20.04-gcc-nightly-2023.10.18-nightly.tar.gz + sudo tar -xzf riscv64-elf-ubuntu-20.04-gcc-nightly-2023.10.18-nightly.tar.gz -C /opt/ + + - name: run tests + run: | + export PATH=$PATH:/opt/riscv/bin + sh scripts/cross-test.sh qemu + + check_test_cases: + runs-on: ubuntu-20.04 + steps: + - name: checkout code + uses: actions/checkout@v3.2.0 + - name: build artifact + run: | + make test + + coding_style: + runs-on: ubuntu-20.04 + steps: + - name: checkout code + uses: actions/checkout@v3.2.0 + - name: style check + run: | + sudo apt-get install -q -y clang-format-12 + sh scripts/check-format.sh + shell: bash diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e92bb52 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.exe +*.o +*.gch +tests/*.d +tests/main +.vs/ +Debug/ +Release/ +*.log diff --git a/LICENSE b/LICENSE index df86fd7..9fa6f8d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Yang Hau +Copyright (c) 2023 SSE2RVV Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e714b3c --- /dev/null +++ b/Makefile @@ -0,0 +1,89 @@ +ifndef CC +override CC = gcc +endif + +ifndef CXX +override CXX = g++ +endif + +ifndef CROSS_COMPILE + processor := $(shell uname -m) +else # CROSS_COMPILE was set + CC = $(CROSS_COMPILE)gcc + CXX = $(CROSS_COMPILE)g++ + CXXFLAGS += -static + LDFLAGS += -static + + check_riscv := $(shell echo | $(CROSS_COMPILE)cpp -dM - | grep " __riscv_xlen " | cut -c22-) + uname_result := $(shell uname -m) + ifeq ($(check_riscv),64) + processor = rv64 + else ifeq ($(uname_result),rv64imafdc) + processor = rv64 + else ifeq ($(check_riscv),32) + processor = rv32 + else ifeq ($(uname_result),rv32i) + processor = rv32 + else + $(error Unsupported cross-compiler) + endif + + ifeq ($(processor),$(filter $(processor),i386 x86_64)) + ARCH_CFLAGS = -maes -mpclmul -mssse3 -msse4.2 + else + ARCH_CFLAGS = -march=$(processor)gcv_zba + endif + + ifeq ($(SIMULATOR_TYPE), qemu) + SIMULATOR += qemu-riscv64 + SIMULATOR_FLAGS = -cpu $(processor),v=true,zba=true,vlen=128 + else + SIMULATOR = spike + SIMULATOR_FLAGS = --isa=$(processor)gcv_zba + PROXY_KERNEL = pk + endif +endif + +CXXFLAGS += -Wall -Wcast-qual -I. 
$(ARCH_CFLAGS)
+LDFLAGS += -lm
+OBJS = \
+	tests/binding.o \
+	tests/common.o \
+	tests/impl.o \
+	tests/main.o
+deps := $(OBJS:%.o=%.o.d)
+
+.SUFFIXES: .o .cpp
+.cpp.o:
+	$(CXX) -o $@ $(CXXFLAGS) -c -MMD -MF $@.d $<
+
+EXEC = tests/main
+
+$(EXEC): $(OBJS)
+	$(CXX) $(LDFLAGS) -o $@ $^
+
+test: tests/main
+ifeq ($(processor),$(filter $(processor),rv32 rv64))
+	$(CC) $(ARCH_CFLAGS) -c sse2rvv.h
+endif
+	$(SIMULATOR) $(SIMULATOR_FLAGS) $(PROXY_KERNEL) $^
+
+build-test: tests/main
+ifeq ($(processor),$(filter $(processor),rv32 rv64))
+	$(CC) $(ARCH_CFLAGS) -c sse2rvv.h
+endif
+
+format:
+	@echo "Formatting files with clang-format.."
+	@if ! hash clang-format; then echo "clang-format is required to indent"; fi
+	clang-format -i sse2rvv.h tests/*.cpp tests/*.h
+
+.PHONY: clean check format
+
+clean:
+	$(RM) $(OBJS) $(EXEC) $(deps) sse2rvv.h.gch
+
+clean-all: clean
+	$(RM) *.log
+
+-include $(deps)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..48277b0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,94 @@
+# sse2rvv
+
+A C/C++ header file that converts Intel SSE intrinsics to RISC-V Vector (RVV) intrinsics.
+
+## Introduction
+
+`sse2rvv` is a translator of Intel SSE (Streaming SIMD Extensions) intrinsics
+to the [RISC-V Vector Extension (RVV)](https://github.com/riscv/riscv-v-spec),
+shortening the time needed to get a working RISC-V program that can then be used to
+extract profiles and to identify hot paths in the code.
+The header file `sse2rvv.h` contains several of the functions provided by Intel
+intrinsic headers such as `<xmmintrin.h>`, implemented with RVV-based counterparts
+to produce the exact semantics of the intrinsics.
+
+This project is based on [sse2neon](https://github.com/DLTcollab/sse2neon) and adapts it to RISC-V.
+
+## Mapping and Coverage
+
+Header file | Extension |
+---|---|
+`<mmintrin.h>` | MMX |
+`<xmmintrin.h>` | SSE |
+`<emmintrin.h>` | SSE2 |
+`<pmmintrin.h>` | SSE3 |
+`<tmmintrin.h>` | SSSE3 |
+`<smmintrin.h>` | SSE4.1 |
+`<nmmintrin.h>` | SSE4.2 |
+`<wmmintrin.h>` | AES |
+
+`sse2rvv` aims to support the SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AES extensions.
+
+In order to deliver RVV-equivalent intrinsics for all widely used SSE intrinsics,
+please be aware that while some SSE intrinsics have a direct mapping to a single
+concrete RVV intrinsic, others lack a 1:1 mapping, meaning that
+their equivalents have to be built from a number of RVV intrinsics.
+
+For example, the SSE intrinsic `_mm_loadu_si128` has a direct RVV mapping (a single
+unit-stride vector load), but `_mm_maddubs_epi16` has to be implemented with multiple RVV instructions.
+
+### Floating-point compatibility
+
+Some conversions require several RVV intrinsics, which may produce results that are
+inconsistent with their SSE counterparts due to intermediate rounding under IEEE-754 arithmetic.
+
+## Usage
+
+- Put the file `sse2rvv.h` into your source code directory.
+
+- Locate the following SSE header files included in the code:
+```C
+#include <xmmintrin.h>
+#include <emmintrin.h>
+```
+  The other SSE headers (`{p,t,s,n,w}mmintrin.h`) can be replaced in the same way.
+
+- Replace them with:
+```C
+#include "sse2rvv.h"
+```
+
+- Explicitly specify platform-specific options to gcc/clang compilers.
+  * On riscv64
+  ```shell
+  -march=rv64gcv_zba
+  ```
+
+## Run Built-in Test Suite
+
+`sse2rvv` provides a unified interface for developing test cases. These test
+cases are located in the `tests` directory, and the input data is specified at
+runtime. 
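+Before running the full suite, a minimal translation unit like the following can
+serve as a quick smoke test (illustrative only; it assumes `_mm_set_epi32`,
+`_mm_add_epi32`, and `_mm_storeu_si128` are among the intrinsics implemented by
+the header):
+```C
+#include <stdio.h>
+// #include <emmintrin.h>  /* original x86 build */
+#include "sse2rvv.h"       /* RISC-V build */
+
+int main(void) {
+  int out[4];
+  __m128i a = _mm_set_epi32(1, 2, 3, 4);
+  __m128i b = _mm_set_epi32(10, 20, 30, 40);
+  _mm_storeu_si128((__m128i *)out, _mm_add_epi32(a, b));
+  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
+  return 0;
+}
+```
+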
Use the following command to run the test cases:
+```shell
+$ make test
+```
+
+## Reference
+* [sse2neon](https://github.com/DLTcollab/sse2neon)
+* [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html)
+* [Microsoft: x86 intrinsics list](https://learn.microsoft.com/en-us/cpp/intrinsics/x86-intrinsics-list)
+* [Arm Neon Intrinsics Reference](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
+* [Neon Programmer's Guide for Armv8-A](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/neon-programmers-guide-for-armv8-a)
+* [NEON Programmer's Guide](https://static.docs.arm.com/den0018/a/DEN0018A_neon_programmers_guide_en.pdf)
+* [qemu/target/i386/ops_sse.h](https://github.com/qemu/qemu/blob/master/target/i386/ops_sse.h): Comprehensive SSE instruction emulation in C. Ideal for semantic checks.
+* [Porting Takua Renderer to 64-bit ARM- Part 1](https://blog.yiningkarlli.com/2021/05/porting-takua-to-arm-pt1.html)
+* [Porting Takua Renderer to 64-bit ARM- Part 2](https://blog.yiningkarlli.com/2021/07/porting-takua-to-arm-pt2.html)
+* [Comparing SIMD on x86-64 and arm64](https://blog.yiningkarlli.com/2021/09/neon-vs-sse.html)
+* [Port with SSE2Neon and SIMDe](https://developer.arm.com/documentation/102581/0200/Port-with-SSE2Neon-and-SIMDe)
+* [Genomics: Optimizing the BWA aligner for Arm Servers](https://community.arm.com/arm-community-blogs/b/high-performance-computing-blog/posts/optimizing-genomics-and-the-bwa-aligner-for-arm-servers)
+* [Bit twiddling with Arm Neon: beating SSE movemasks, counting bits and more](https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon)
+* [C/C++ on Graviton](https://github.com/aws/aws-graviton-getting-started/blob/main/c-c%2B%2B.md)
+
+## Licensing
+
+`sse2rvv` is freely redistributable under the MIT License.
diff --git a/scripts/check-format.sh b/scripts/check-format.sh
new file mode 100755
index 0000000..b22c668
--- /dev/null
+++ b/scripts/check-format.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -x
+
+for file in ${SOURCES};
+do
+    clang-format ${file} > expected-format
+    diff -u -p --label="${file}" --label="expected coding style" ${file} expected-format
+done
+exit $(clang-format --output-replacements-xml ${SOURCES} | egrep -c "</replacement")
diff --git a/scripts/cross-test.sh b/scripts/cross-test.sh
new file mode 100755
index 0000000..3275059
--- /dev/null
+++ b/scripts/cross-test.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+# Clang/LLVM is natively a cross-compiler.
+# TODO: Do cross-compilation using Clang
+# https://clang.llvm.org/docs/CrossCompilation.html
+if [ $(printenv CXX | grep clang) ]; then
+  exit
+fi
+
+set -x
+
+make clean
+make CROSS_COMPILE=riscv64-unknown-elf- SIMULATOR_TYPE=$1 test || exit 1 # riscv64
diff --git a/sse2rvv.h b/sse2rvv.h
new file mode 100644
index 0000000..23108de
--- /dev/null
+++ b/sse2rvv.h
@@ -0,0 +1,3272 @@
+#ifndef SSE2RVV_H
+#define SSE2RVV_H
+
+// This header file provides a simple API translation layer
+// from SSE intrinsics to their corresponding RVV versions
+
+/*
+ * sse2rvv is freely redistributable under the MIT License. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// /* Tunable configurations */ + +// /* Enable precise implementation of math operations +// * This would slow down the computation a bit, but gives consistent result +// with +// * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) +// */ +// /* _mm_min|max_ps|ss|pd|sd */ +// #ifndef SSE2RVV_PRECISE_MINMAX +// #define SSE2RVV_PRECISE_MINMAX (0) +// #endif +// /* _mm_rcp_ps and _mm_div_ps */ +// #ifndef SSE2RVV_PRECISE_DIV +// #define SSE2RVV_PRECISE_DIV (0) +// #endif +// /* _mm_sqrt_ps and _mm_rsqrt_ps */ +// #ifndef SSE2RVV_PRECISE_SQRT +// #define SSE2RVV_PRECISE_SQRT (0) +// #endif +// /* _mm_dp_pd */ +// #ifndef SSE2RVV_PRECISE_DP +// #define SSE2RVV_PRECISE_DP (0) +// #endif + +/* compiler specific definitions */ +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2rvv_likely(x) __builtin_expect(!!(x), 1) +#define _sse2rvv_unlikely(x) __builtin_expect(!!(x), 0) +#else +#pragma message("Macro name collisions may happen with unsupported compilers.") +#endif + +/* C language does not allow initializing a variable with a function call. */ +#ifdef __cplusplus +#define _sse2rvv_const static const +#else +#define _sse2rvv_const const +#endif + +#include +#include +#include + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. 
+ */ +typedef vint64m1_t __m64; +typedef vfloat32m1_t __m128; /* 128-bit vector containing 4 floats */ +typedef vfloat64m1_t __m128d; /* 128-bit vector containing 2 doubles */ +typedef vint64m1_t __m128i; /* 128-bit vector containing integers */ +typedef vuint8m4_t uint8x16x4_t; + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://learn.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec;
+
+/* SSE macros */
+// #define _MM_GET_FLUSH_ZERO_MODE _sse2rvv_mm_get_flush_zero_mode
+// #define _MM_SET_FLUSH_ZERO_MODE _sse2rvv_mm_set_flush_zero_mode
+// #define _MM_GET_DENORMALS_ZERO_MODE _sse2rvv_mm_get_denormals_zero_mode
+// #define _MM_SET_DENORMALS_ZERO_MODE _sse2rvv_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
+// FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+// FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+// FORCE_INLINE __m128 _mm_set_ps1(float);
+// FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+// FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+// FORCE_INLINE __m128i _mm_castps_si128(__m128);
+// FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+// FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+// FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+// FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+// FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+// FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+// FORCE_INLINE __m128d _mm_set_pd(double, double);
+// FORCE_INLINE __m128i _mm_set1_epi32(int);
+// FORCE_INLINE __m128i _mm_setzero_si128(void);
+// SSE4.1
+// FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+// FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+// FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+// FORCE_INLINE __m128 _mm_floor_ps(__m128);
+// FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+// FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+// FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. 
*/ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ +}; + +// The bit field mapping to the FPCR(floating-point control register) +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t bit24 : 1; + uint8_t res2 : 7; +#if defined(__aarch64__) || defined(_M_ARM64) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +// FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) {} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +// FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) {} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +// FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) {} + +// FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) {} + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +// #define _mm_shuffle_epi32_default(a, imm) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +// FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) {} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
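+// Illustrative C sketch of these semantics (reference only; the helper name is
+// hypothetical and the eventual RVV implementation may differ):
+//   static inline void shuffle_epi_2301_ref(const int32_t a[4], int32_t dst[4])
+//   {
+//     dst[0] = a[1]; dst[1] = a[0]; /* swap the lower pair */
+//     dst[2] = a[3]; dst[3] = a[2]; /* swap the upper pair */
+//   }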
+// FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) {} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +// FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) {} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +// FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) {} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +// FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) {} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +// FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) {} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +// FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) {} + +// FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) {} + +// FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) {} + +// FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) {} + +#if defined(__aarch64__) || defined(_M_ARM64) +// #define _mm_shuffle_epi32_splat(a, imm) +#else +// #define _mm_shuffle_epi32_splat(a, imm) +#endif + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +// #define _mm_shuffle_ps_default(a, b, imm) + +// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. +// Store the results in the low 64 bits of dst, with the high 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 +// #define _mm_shufflelo_epi16_function(a, imm) + +// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. +// Store the results in the high 64 bits of dst, with the low 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 +// #define _mm_shufflehi_epi16_function(a, imm) + +/* MMX */ + +//_mm_empty is a no-op on arm +// FORCE_INLINE void _mm_empty(void) {} +/* SSE */ + +// Add packed single-precision (32-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps +// FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {} + +// Add the lower single-precision (32-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss +// FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) {} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. 
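+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): the AND operates on the raw bit patterns of the floats.
+//   static inline void and_ps_ref(const float a[4], const float b[4],
+//                                 float dst[4])
+//   {
+//     for (int i = 0; i < 4; i++) {
+//       uint32_t ua, ub, ur;
+//       memcpy(&ua, &a[i], 4);
+//       memcpy(&ub, &b[i], 4);
+//       ur = ua & ub;
+//       memcpy(&dst[i], &ur, 4);
+//     }
+//   }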
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +// FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) {} + +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps +// FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) {} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 +// FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) {} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 +// FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps +// FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss +// FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps +// FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss +// FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps +// FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss +// FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps +// FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. 
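+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): a true comparison yields an all-ones bit pattern in lane 0.
+//   static inline void cmple_ss_ref(const float a[4], const float b[4],
+//                                   float dst[4])
+//   {
+//     uint32_t m = (a[0] <= b[0]) ? 0xFFFFFFFFu : 0u;
+//     memcpy(&dst[0], &m, 4);
+//     dst[1] = a[1]; dst[2] = a[2]; dst[3] = a[3];
+//   }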
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss +// FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps +// FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss +// FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps +// FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss +// FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps +// FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss +// FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps +// FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss +// FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps +// FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss +// FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps +// FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss +// FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +// FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss +// FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps +// FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss +// FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss +// FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss +// FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss +// FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). 
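+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): unlike the cmp* family, this returns a plain int, and an
+// unordered (NaN) comparison yields 0.
+//   static inline int comile_ss_ref(const float a[4], const float b[4])
+//   {
+//     return (a[0] <= b[0]) ? 1 : 0;
+//   }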
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss +// FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss +// FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) {} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss +// FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) {} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps +// FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi +// FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) {} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss +// FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) {} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +// FORCE_INLINE int _mm_cvt_ss2si(__m128 a) {} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps +// FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) {} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps +// FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) {} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps +// FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) {} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. 
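+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): only the lower four bytes of the 64-bit input are converted.
+//   static inline void cvtpi8_ps_ref(const int8_t a[8], float dst[4])
+//   {
+//     for (int i = 0; i < 4; i++)
+//       dst[i] = (float)a[i];
+//   }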
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps +// FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 +// FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 +// #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 +// FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) {} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps +// FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) {} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps +// FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) {} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss +// #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss +// FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) {} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +// FORCE_INLINE float _mm_cvtss_f32(__m128 a) {} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 +// #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 +// FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
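+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): the 64-bit result holds the lower two elements, and the C
+// cast truncates toward zero just like the intrinsic.
+//   static inline void cvtt_ps2pi_ref(const float a[4], int32_t dst[2])
+//   {
+//     dst[0] = (int32_t)a[0];
+//     dst[1] = (int32_t)a[1];
+//   }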
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi +// FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) {} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si +// FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32 +// #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32 +// #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64 +// FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) {} + +// Divide packed single-precision (32-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement +// division by multiplying a by b's reciprocal before using the Newton-Raphson +// method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps +// FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) {} + +// Divide the lower single-precision (32-bit) floating-point element in a by the +// lower single-precision (32-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper 3 packed elements from a to +// the upper elements of dst. +// Warning: ARMv7-A does not produce the same result compared to Intel and not +// IEEE-compliant. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss +// FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) {} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 +// #define _mm_extract_pi16(a, imm) + +// Free aligned memory that was allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free +#if !defined(SSE2RVV_ALLOC_DEFINED) +// FORCE_INLINE void _mm_free(void *addr) {} +#endif + +// Macro: Get the flush zero bits from the MXCSR control and status register. +// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or +// _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE +// FORCE_INLINE unsigned int _sse2rvv_mm_get_flush_zero_mode(void) {} + +// Macro: Get the rounding mode bits from the MXCSR control and status register. 
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, +// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE +// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) {} + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16 +// #define _mm_insert_pi16(a, b, imm) + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps +// FORCE_INLINE __m128 _mm_load_ps(const float *p) {} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 +// #define _mm_load_ps1 _mm_load1_ps + +// Load a single-precision (32-bit) floating-point element from memory into the +// lower of dst, and zero the upper 3 elements. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss +// FORCE_INLINE __m128 _mm_load_ss(const float *p) {} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +// FORCE_INLINE __m128 _mm_load1_ps(const float *p) {} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// upper 2 elements of dst, and copy the lower 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi +// FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) {} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi +// FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) {} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps +// FORCE_INLINE __m128 _mm_loadr_ps(const float *p) {} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +// FORCE_INLINE __m128 _mm_loadu_ps(const float *p) {} + +// Load unaligned 16-bit integer from memory into the first element of dst. 
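+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): only 16 bits are read, and the remaining lanes are zeroed.
+//   static inline void loadu_si16_ref(const void *p, uint16_t dst[8])
+//   {
+//     memset(dst, 0, 16);
+//     memcpy(&dst[0], p, 2);
+//   }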
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 +// FORCE_INLINE __m128i _mm_loadu_si16(const void *p) {} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 +// FORCE_INLINE __m128i _mm_loadu_si64(const void *p) {} + +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +#if !defined(SSE2RVV_ALLOC_DEFINED) +// FORCE_INLINE void *_mm_malloc(size_t size, size_t align) {} +#endif + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 +// FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) {} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq +// #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 +// FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps +// FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) {} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 +// FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss +// FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) {} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 +// FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) {} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. 
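+// Illustrative C sketch of the x86 semantics (reference only; the helper name
+// is hypothetical): the second operand is returned whenever the comparison is
+// false, which is what makes the NaN and signed-zero behavior non-IEEE.
+//   static inline void min_ps_ref(const float a[4], const float b[4],
+//                                 float dst[4])
+//   {
+//     for (int i = 0; i < 4; i++)
+//       dst[i] = (a[i] < b[i]) ? a[i] : b[i];
+//   }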
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps +// FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) {} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 +// FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) {} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss +// FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) {} + +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss +// FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) {} + +// Move the upper 2 single-precision (32-bit) floating-point elements from b to +// the lower 2 elements of dst, and copy the upper 2 elements from a to the +// upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps +// FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) {} + +// Move the lower 2 single-precision (32-bit) floating-point elements from b to +// the upper 2 elements of dst, and copy the lower 2 elements from a to the +// lower 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps +// FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) {} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8 +// FORCE_INLINE int _mm_movemask_pi8(__m64 a) {} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed single-precision (32-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps +// FORCE_INLINE int _mm_movemask_ps(__m128 a) {} + +// Multiply packed single-precision (32-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps +// FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) {} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss +// FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) {} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
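+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical):
+//   static inline void mulhi_pu16_ref(const uint16_t a[4], const uint16_t b[4],
+//                                     uint16_t dst[4])
+//   {
+//     for (int i = 0; i < 4; i++)
+//       dst[i] = (uint16_t)(((uint32_t)a[i] * (uint32_t)b[i]) >> 16);
+//   }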
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 +// FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) {} + +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps +// FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) {} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb +// #define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw +// #define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw +// #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw +// #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw +// #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub +// #define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw +// #define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub +// #define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb +// #define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw +// #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Fetch the line of data from memory that contains address p to a location in +// the cache hierarchy specified by the locality hint i. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch +// FORCE_INLINE void _mm_prefetch(char const *p, int i) {} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. 
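+// Illustrative C sketch of the semantics (reference only; the helper name is
+// hypothetical): for the 64-bit form there is a single sum, stored in the low
+// 16 bits, with the remaining lanes zeroed.
+//   static inline void sad_pu8_ref(const uint8_t a[8], const uint8_t b[8],
+//                                  uint16_t dst[4])
+//   {
+//     uint16_t sum = 0;
+//     for (int i = 0; i < 8; i++)
+//       sum += (uint16_t)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
+//     dst[0] = sum;
+//     dst[1] = dst[2] = dst[3] = 0;
+//   }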
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw +// #define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw +// #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps +// FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) {} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss +// FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) {} + +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps +// FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) {} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss +// FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) {} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8 +// FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) {} + +// Macro: Set the flush zero bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The flush zero may contain any of the +// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE +// FORCE_INLINE void _sse2rvv_mm_set_flush_zero_mode(unsigned int flag) {} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps +// FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) {} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1 +// FORCE_INLINE __m128 _mm_set_ps1(float _w) {} + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. 
The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE +// FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) {} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss +// FORCE_INLINE __m128 _mm_set_ss(float a) {} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps +// FORCE_INLINE __m128 _mm_set1_ps(float _w) {} + +// Set the MXCSR control and status register with the value in unsigned 32-bit +// integer a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr +// FIXME: _mm_setcsr() implementation supports changing the rounding mode only. +// FORCE_INLINE void _mm_setcsr(unsigned int a) {} + +// Get the unsigned 32-bit value of the MXCSR control and status register. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr +// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. +// FORCE_INLINE unsigned int _mm_getcsr(void) {} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps +// FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) {} + +// Return vector of type __m128 with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps +// FORCE_INLINE __m128 _mm_setzero_ps(void) {} + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +#ifdef _sse2rvv_shuffle +// #define _mm_shuffle_pi16(a, imm) +#else +// #define _mm_shuffle_pi16(a, imm) +#endif + +// Perform a serializing operation on all store-to-memory instructions that were +// issued prior to this instruction. Guarantees that every store instruction +// that precedes, in program order, is globally visible before any store +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence +// FORCE_INLINE void _mm_sfence(void) {} + +// Perform a serializing operation on all load-from-memory and store-to-memory +// instructions that were issued prior to this instruction. Guarantees that +// every memory access that precedes, in program order, the memory fence +// instruction is globally visible before any memory instruction which follows +// the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence +// FORCE_INLINE void _mm_mfence(void) {} + +// Perform a serializing operation on all load-from-memory instructions that +// were issued prior to this instruction. Guarantees that every load instruction +// that precedes, in program order, is globally visible before any load +// instruction which follows the fence in program order. 
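+// On RISC-V, one plausible way to provide this load-ordering guarantee (an
+// assumption, not necessarily what this header will end up using) is:
+//   __asm__ __volatile__("fence r, r" ::: "memory");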
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
+// FORCE_INLINE void _mm_lfence(void) {}
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#ifdef _sse2rvv_shuffle
+// #define _mm_shuffle_ps(a, b, imm)
+#else // generic
+// #define _mm_shuffle_ps(a, b, imm)
+#endif
+
+// Compute the square root of packed single-precision (32-bit) floating-point
+// elements in a, and store the results in dst.
+// Note: unlike the Arm NEON port this project derives from, RVV provides a
+// vector square-root instruction (vfsqrt), so no reciprocal/Newton-Raphson
+// approximation should be needed here.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
+// FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) {}
+
+// Compute the square root of the lower single-precision (32-bit) floating-point
+// element in a, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
+// FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) {}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
+// FORCE_INLINE void _mm_store_ps(float *p, __m128 a) {}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
+// FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) {}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
+// FORCE_INLINE void _mm_store_ss(float *p, __m128 a) {}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
+// #define _mm_store1_ps _mm_store_ps1
+
+// Store the upper 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
+// FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) {}
+
+// Store the lower 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
+// FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) {}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
+// FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) {}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
+// FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) {}
+
+// Stores 16-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
+// FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) {}
+
+// Stores 64-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
+// FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) {}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
+// FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) {}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
+// FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) {}
+
+// Subtract packed single-precision (32-bit) floating-point elements in b from
+// packed single-precision (32-bit) floating-point elements in a, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
+// FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) {}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
+// FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) {}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// Implemented with the unpack/move intrinsics declared in this header so that
+// no Arm NEON types or intrinsics are referenced.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                \
+  do {                                                           \
+    __m128 _tmp0 = _mm_unpacklo_ps((row0), (row1));              \
+    __m128 _tmp2 = _mm_unpacklo_ps((row2), (row3));              \
+    __m128 _tmp1 = _mm_unpackhi_ps((row0), (row1));              \
+    __m128 _tmp3 = _mm_unpackhi_ps((row2), (row3));              \
+    (row0) = _mm_movelh_ps(_tmp0, _tmp2);                        \
+    (row1) = _mm_movehl_ps(_tmp2, _tmp0);                        \
+    (row2) = _mm_movelh_ps(_tmp1, _tmp3);                        \
+    (row3) = _mm_movehl_ps(_tmp3, _tmp1);                        \
+  } while (0)
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions. We'll just alias them here.
+// #define _mm_ucomieq_ss _mm_comieq_ss
+// #define _mm_ucomige_ss _mm_comige_ss
+// #define _mm_ucomigt_ss _mm_comigt_ss
+// #define _mm_ucomile_ss _mm_comile_ss
+// #define _mm_ucomilt_ss _mm_comilt_ss
+// #define _mm_ucomineq_ss _mm_comineq_ss
+
+// Return vector of type __m128i with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 +// FORCE_INLINE __m128i _mm_undefined_si128(void) {} + +// Return vector of type __m128 with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps +// FORCE_INLINE __m128 _mm_undefined_ps(void) {} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps +// FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) {} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps +// FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) {} + +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps +// FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) {} + +/* SSE2 */ + +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 +// FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) {} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +// FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) {} + +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 +// FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) {} + +// Add packed 8-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 +// FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) {} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd +// FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) {} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd +// FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) {} + +// Add 64-bit integers a and b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 +// FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) {} + +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 +// FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) {} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. 
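+// A scalar sketch of the saturating-add semantics above (illustrative only;
+// the i8[] lane notation is informal shorthand):
+//   for (int j = 0; j < 16; j++) {
+//     int16_t s = (int16_t)a.i8[j] + (int16_t)b.i8[j];
+//     dst.i8[j] = (int8_t)(s > 127 ? 127 : (s < -128 ? -128 : s));
+//   }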
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 +// FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) {} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 +// FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) {} + +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 +// FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) {} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd +// FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +// FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd +// FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) {} + +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 +// FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) {} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 +// FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) {} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 +// FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) {} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128 +// #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128 +// #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps +// FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) {} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128 +// FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) {} + +// Cast vector of type __m128 to type __m128d. 
This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd +// FORCE_INLINE __m128d _mm_castps_pd(__m128 a) {} + +// Cast vector of type __m128 to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128 +// FORCE_INLINE __m128i _mm_castps_si128(__m128 a) {} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd +// FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) {} + +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps +// FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) {} + +// Invalidate and flush the cache line that contains p from all levels of the +// cache hierarchy. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush +#if defined(__APPLE__) +#include +#endif +// FORCE_INLINE void _mm_clflush(void const *p) {} + +// Compare packed 16-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16 +// FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) {} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32 +// FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) {} + +// Compare packed 8-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8 +// FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd +// FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd +// FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd +// FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. 
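+// A scalar sketch of the semantics above (illustrative only; the u64[]/f64[]
+// lane notation is informal shorthand, and a NaN operand compares as false):
+//   dst.u64[0] = (a.f64[0] >= b.f64[0]) ? 0xFFFFFFFFFFFFFFFF : 0;
+//   dst.f64[1] = a.f64[1];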
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd +// FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) {} + +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 +// FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) {} + +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 +// FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) {} + +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 +// FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd +// FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd +// FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd +// FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd +// FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) {} + +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 +// FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) {} + +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 +// FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) {} + +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 +// FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd +// FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd +// FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd +// FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd +// FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd +// FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd +// FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd +// FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd +// FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd +// FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd +// FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd +// FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd +// FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd +// FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd +// FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd +// FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd +// FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd +// FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd +// FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd +// FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd +// FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd +// FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd +// FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) {} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd +// FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) {} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +// FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 +// FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 +// FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps +// FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) {} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd +// FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 +// FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd +// FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) {} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 +// FORCE_INLINE double _mm_cvtsd_f64(__m128d a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 +// FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. 
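+// For example (illustrative): with the default MXCSR rounding mode
+// (round-to-nearest-even), 2.5 converts to 2 and 3.5 converts to 4; the
+// truncating variant is _mm_cvttsd_si64.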
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
+// FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) {}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
+// #define _mm_cvtsd_si64x _mm_cvtsd_si64
+
+// Convert the lower double-precision (64-bit) floating-point element in b to a
+// single-precision (32-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
+// FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) {}
+
+// Copy the lower 32-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
+// FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) {}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
+// FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) {}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+// #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Convert the signed 32-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
+// FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) {}
+
+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
+// FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) {}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
+// FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) {}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
+// FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) {}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
+// #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd +// #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd +// FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 +// FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) {} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 +// FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) {} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +// FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 +// FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 +// FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) {} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x +// #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd +// FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) {} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd +// FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) {} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +// #define _mm_extract_epi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. 
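+// Usage sketch (illustrative only):
+//   __m128i v = _mm_set1_epi16(0);
+//   v = _mm_insert_epi16(v, 0x7FFF, 3);  // 16-bit lane 3 becomes 0x7FFF,
+//                                        // all other lanes keep their value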
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+// #define _mm_insert_epi16(a, b, imm)
+
+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
+// FORCE_INLINE __m128d _mm_load_pd(const double *p) {}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
+// #define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
+// FORCE_INLINE __m128d _mm_load_sd(const double *p) {}
+
+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
+// FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) {}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
+// FORCE_INLINE __m128d _mm_load1_pd(const double *p) {}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
+// FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) {}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
+// FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) {}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
+// FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) {}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+// FORCE_INLINE __m128d _mm_loadr_pd(const double *p) {}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into
+// dst. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+// FORCE_INLINE __m128d _mm_loadu_pd(const double *p) {}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
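+// Usage sketch (illustrative only): unlike _mm_load_si128, the pointer does
+// not have to be 16-byte aligned:
+//   unsigned char buf[17];
+//   __m128i v = _mm_loadu_si128((const __m128i *)(buf + 1));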
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 +// FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) {} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32 +// FORCE_INLINE __m128i _mm_loadu_si32(const void *p) {} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Horizontally add adjacent pairs of intermediate +// 32-bit integers, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16 +// FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) {} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. mem_addr does not need to be aligned +// on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128 +// FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char +// *mem_addr) {} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 +// FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) {} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 +// FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd +// FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd +// FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) {} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 +// FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) {} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 +// FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) {} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd +// FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) {} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd +// FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) {} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 +// FORCE_INLINE __m128i _mm_move_epi64(__m128i a) {} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd +// FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) {} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +// FORCE_INLINE int _mm_movemask_epi8(__m128i a) {} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd +// FORCE_INLINE int _mm_movemask_pd(__m128d a) {} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 +// FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) {} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 +// FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) {} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 +// FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) {} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd +// FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) {} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd +// FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) {} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 +// FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) {} + +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 +// FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) {} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
+// FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) {}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and store the low 16 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
+// FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) {}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
+// FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) {}
+
+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
+// FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) {}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
+// FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) {}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
+// FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) {}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
+// FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) {}
+
+// Pause the processor. This is typically used in spin-wait loops and, depending
+// on the x86 processor, typical values are in the 40-100 cycle range. On
+// RISC-V, the Zihintpause 'pause' hint (where available) or a plain no-op is
+// likely a reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+// FORCE_INLINE void _mm_pause(void) {}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+// FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) {}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+// FORCE_INLINE __m128i _mm_set_epi16(short i7,
+//                                    short i6,
+//                                    short i5,
+//                                    short i4,
+//                                    short i3,
+//                                    short i2,
+//                                    short i1,
+//                                    short i0) {}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+// FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) {}
+
+// Set packed 64-bit integers in dst with the supplied values.
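+// Note on argument order (illustrative): the first argument supplies the upper
+// 64-bit lane, e.g. with the 64-bit-integer variant _mm_set_epi64x below,
+//   __m128i v = _mm_set_epi64x(1, 2);  // lane 0 (low) == 2, lane 1 (high) == 1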
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64 +// FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) {} + +// Set packed 64-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x +// FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) {} + +// Set packed 8-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8 +// FORCE_INLINE __m128i _mm_set_epi8(signed char b15, +// signed char b14, +// signed char b13, +// signed char b12, +// signed char b11, +// signed char b10, +// signed char b9, +// signed char b8, +// signed char b7, +// signed char b6, +// signed char b5, +// signed char b4, +// signed char b3, +// signed char b2, +// signed char b1, +// signed char b0) {} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd +// FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) {} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 +// #define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd +// FORCE_INLINE __m128d _mm_set_sd(double a) {} + +// Broadcast 16-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 +// FORCE_INLINE __m128i _mm_set1_epi16(short w) {} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +// FORCE_INLINE __m128i _mm_set1_epi32(int _i) {} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 +// FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) {} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x +// FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) {} + +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 +// FORCE_INLINE __m128i _mm_set1_epi8(signed char w) {} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd +// FORCE_INLINE __m128d _mm_set1_pd(double d) {} + +// Set packed 16-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 +// FORCE_INLINE __m128i _mm_setr_epi16(short w0, +// short w1, +// short w2, +// short w3, +// short w4, +// short w5, +// short w6, +// short w7) {} + +// Set packed 32-bit integers in dst with the supplied values in reverse order. 
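+// The set1 family above is a plain splat. As a hedged sketch (illustrative
+// helper name, fixed vl of 4, VLEN >= 128 assumed), a set1_epi32-style
+// broadcast could be a single vmv.v.x:
+static inline void sse2rvv_sketch_set1_epi32(int value, int32_t out[4])
+{
+  size_t vl = 4; /* a 128-bit vector holds 4 x int32 */
+  vint32m1_t v = __riscv_vmv_v_x_i32m1(value, vl); /* broadcast the scalar */
+  __riscv_vse32_v_i32m1(out, v, vl);
+}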
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 +// FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) {} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 +// FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) {} + +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 +// FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, +// signed char b1, +// signed char b2, +// signed char b3, +// signed char b4, +// signed char b5, +// signed char b6, +// signed char b7, +// signed char b8, +// signed char b9, +// signed char b10, +// signed char b11, +// signed char b12, +// signed char b13, +// signed char b14, +// signed char b15) {} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd +// FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) {} + +// Return vector of type __m128d with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd +// FORCE_INLINE __m128d _mm_setzero_pd(void) {} + +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 +// FORCE_INLINE __m128i _mm_setzero_si128(void) {} + +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2rvv_shuffle) +// #define _mm_shuffle_epi32(a, imm) +#else // generic +// #define _mm_shuffle_epi32(a, imm) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +#ifdef _sse2rvv_shuffle +// #define _mm_shuffle_pd(a, b, imm8) +#else +// #define _mm_shuffle_pd(a, b, imm8) +#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2rvv_shuffle) +// #define _mm_shufflehi_epi16(a, imm) +#else // generic +// #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2rvv_shuffle) +// #define _mm_shufflelo_epi16(a, imm) +#else // generic +// #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 +// FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) {} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. 
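+// Immediate shuffles such as _mm_shuffle_epi32 above usually become a
+// register gather on RVV. A hedged sketch (illustrative helper, plain arrays,
+// vl fixed to 4): build the lane indices from imm8 and let vrgather do the
+// permutation.
+static inline void sse2rvv_sketch_shuffle_epi32(const int32_t *a, int imm,
+                                                int32_t *out)
+{
+  size_t vl = 4;
+  uint32_t idx[4] = {(uint32_t)(imm & 3), (uint32_t)((imm >> 2) & 3),
+                     (uint32_t)((imm >> 4) & 3), (uint32_t)((imm >> 6) & 3)};
+  vint32m1_t va = __riscv_vle32_v_i32m1(a, vl);
+  vuint32m1_t vidx = __riscv_vle32_v_u32m1(idx, vl);
+  /* output lane i takes source lane idx[i] */
+  __riscv_vse32_v_i32m1(out, __riscv_vrgather_vv_i32m1(va, vidx, vl), vl);
+}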
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 +// FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) {} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 +// FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) {} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 +// FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) {} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 +// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) {} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 +// FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) {} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 +// #define _mm_slli_si128(a, imm) + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd +// FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) {} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd +// FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) {} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 +// FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) {} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 +// FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) {} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 +// FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) {} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +// #define _mm_srai_epi32(a, imm) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. 
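+// One pitfall with the shift stubs above: SSE defines counts larger than the
+// element width to produce zero, while RVV's vsll/vsrl/vsra only use the low
+// log2(SEW) bits of the count, so a guard is needed. A hedged sketch for an
+// slli_epi16-style shift (illustrative helper, vl fixed to 8):
+static inline void sse2rvv_sketch_slli_epi16(const int16_t *a, int imm,
+                                             int16_t *out)
+{
+  size_t vl = 8;
+  vint16m1_t va = __riscv_vle16_v_i16m1(a, vl);
+  vint16m1_t r;
+  if (imm < 0 || imm > 15)
+    r = __riscv_vmv_v_x_i16m1(0, vl); /* out-of-range count yields zero */
+  else
+    r = __riscv_vsll_vx_i16m1(va, (size_t)imm, vl);
+  __riscv_vse16_v_i16m1(out, r, vl);
+}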
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 +// FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) {} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 +// FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) {} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 +// FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) {} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 +// #define _mm_srli_epi16(a, imm) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +// #define _mm_srli_epi32(a, imm) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +// #define _mm_srli_epi64(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 +// #define _mm_srli_si128(a, imm) + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd +// FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) {} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 +// FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) {} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd +// FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) {} + +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 +// FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) {} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
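+// The byte shifts (_mm_slli_si128/_mm_srli_si128) above have a VLEN subtlety:
+// vslidedown is the natural tool, but it only fills with zeros past VLMAX, so
+// on machines wider than 128 bits the vacated bytes must be zeroed
+// explicitly. A plain-C reference of the srli_si128 semantics (illustrative
+// helper, arrays stand in for __m128i):
+static inline void sse2rvv_ref_srli_si128(const uint8_t a[16], int imm,
+                                          uint8_t dst[16])
+{
+  for (int i = 0; i < 16; i++)
+    dst[i] = (imm >= 0 && i + imm < 16) ? a[i + imm] : 0;
+}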
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd +// #define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd +// FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) {} + +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 +// FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) {} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd +// FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) {} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd +// FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) {} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd +// FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) {} + +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 +// FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) {} + +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 +// FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) {} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd +// FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) {} + +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 +// FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) {} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 +// FORCE_INLINE void _mm_stream_si32(int *p, int a) {} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. 
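+// Several of the store stubs above only touch one lane, and the _mm_stream_*
+// hints have no direct RVV counterpart (a simple port can treat them as
+// ordinary stores). A hedged sketch of a storeh_pd-style store (illustrative
+// helper, arrays stand in for __m128d, vl fixed to 2): slide the upper lane
+// down and extract it as a scalar.
+static inline void sse2rvv_sketch_storeh_pd(double *mem_addr, const double *a)
+{
+  size_t vl = 2;
+  vfloat64m1_t va = __riscv_vle64_v_f64m1(a, vl);
+  vfloat64m1_t hi = __riscv_vslidedown_vx_f64m1(va, 1, vl);
+  *mem_addr = __riscv_vfmv_f_s_f64m1_f64(hi); /* scalar read of lane 0 */
+}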
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 +// FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) {} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 +// FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) {} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +// FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) {} + +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 +// FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) {} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 +// FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) {} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd +// FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) {} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd +// FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) {} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 +// FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) {} + +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 +// FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) {} + +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 +// FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) {} + +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 +// FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) {} + +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. 
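+// The saturating subtracts above map directly onto RVV's vssub/vssubu
+// instructions. A hedged sketch for subs_epu16 (illustrative helper, vl fixed
+// to 8):
+static inline void sse2rvv_sketch_subs_epu16(const uint16_t *a,
+                                             const uint16_t *b, uint16_t *out)
+{
+  size_t vl = 8;
+  vuint16m1_t va = __riscv_vle16_v_u16m1(a, vl);
+  vuint16m1_t vb = __riscv_vle16_v_u16m1(b, vl);
+  /* vssubu clamps at zero, matching unsigned saturation */
+  __riscv_vse16_v_u16m1(out, __riscv_vssubu_vv_u16m1(va, vb, vl), vl);
+}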
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 +// FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) {} + +// #define _mm_ucomieq_sd _mm_comieq_sd +// #define _mm_ucomige_sd _mm_comige_sd +// #define _mm_ucomigt_sd _mm_comigt_sd +// #define _mm_ucomile_sd _mm_comile_sd +// #define _mm_ucomilt_sd _mm_comilt_sd +// #define _mm_ucomineq_sd _mm_comineq_sd + +// Return vector of type __m128d with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd +// FORCE_INLINE __m128d _mm_undefined_pd(void) {} + +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 +// FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) {} + +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 +// FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) {} + +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 +// FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) {} + +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 +// FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) {} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd +// FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) {} + +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 +// FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) {} + +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 +// FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) {} + +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 +// FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) {} + +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 +// FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) {} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. 
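+// The unpack/interleave stubs above have no single RVV instruction; they are
+// typically built from an index vector plus vrgather or from segment loads.
+// The intended lane order is easiest to state in plain C, e.g. for
+// unpacklo_epi32 (illustrative helper, arrays stand in for __m128i):
+static inline void sse2rvv_ref_unpacklo_epi32(const int32_t a[4],
+                                              const int32_t b[4],
+                                              int32_t dst[4])
+{
+  dst[0] = a[0]; /* low halves of a and b, interleaved a-first */
+  dst[1] = b[0];
+  dst[2] = a[1];
+  dst[3] = b[1];
+}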
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd +// FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) {} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd +// FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) {} + +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 +// FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) {} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd +// FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) {} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps +// FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) {} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd +// FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) {} + +// Horizontally add adjacent pairs of single-precision (32-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps +// FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) {} + +// Horizontally subtract adjacent pairs of double-precision (64-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd +// FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) {} + +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps +// FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) {} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 +// #define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd +// #define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd +// FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) {} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. 
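+// The SSE3 addsub/hadd/hsub stubs above all follow a fixed even/odd lane
+// pattern. A plain-C reference of addsub_ps (illustrative helper; even lanes
+// subtract, odd lanes add):
+static inline void sse2rvv_ref_addsub_ps(const float a[4], const float b[4],
+                                         float dst[4])
+{
+  for (int i = 0; i < 4; i++)
+    dst[i] = (i & 1) ? a[i] + b[i] : a[i] - b[i];
+}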
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps +// FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) {} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps +// FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) {} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 +// FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) {} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 +// FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) {} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 +// FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) {} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 +// FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) {} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 +// FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) {} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 +// FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) {} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +#if defined(__GNUC__) && !defined(__clang__) +// #define _mm_alignr_epi8(a, b, imm) + +#else +// #define _mm_alignr_epi8(a, b, imm) + +#endif + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +// #define _mm_alignr_pi8(a, b, imm) + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 +// FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) {} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 +// FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) {} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. 
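+// RVV has no integer absolute-value instruction, so the _mm_abs_* stubs above
+// are usually max(v, -v). A hedged sketch for abs_epi16 (illustrative helper,
+// vl fixed to 8); note that INT16_MIN stays INT16_MIN, as PABSW specifies:
+static inline void sse2rvv_sketch_abs_epi16(const int16_t *a, int16_t *out)
+{
+  size_t vl = 8;
+  vint16m1_t v = __riscv_vle16_v_i16m1(a, vl);
+  vint16m1_t neg = __riscv_vrsub_vx_i16m1(v, 0, vl); /* 0 - v */
+  __riscv_vse16_v_i16m1(out, __riscv_vmax_vv_i16m1(v, neg, vl), vl);
+}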
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 +// FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) {} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 +// FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) {} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 +// FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) {} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 +// FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) {} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 +// FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) {} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 +// FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) {} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 +// FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) {} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 +// FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) {} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 +// FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) {} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 +// FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) {} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +// FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) {} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. 
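+// The horizontal add/subtract stubs above pack all pairs from a into the low
+// half of dst and all pairs from b into the high half. A plain-C reference of
+// hadd_epi16 (illustrative helper, arrays stand in for __m128i):
+static inline void sse2rvv_ref_hadd_epi16(const int16_t a[8],
+                                          const int16_t b[8], int16_t dst[8])
+{
+  for (int i = 0; i < 4; i++)
+  {
+    dst[i] = (int16_t)(a[2 * i] + a[2 * i + 1]);
+    dst[i + 4] = (int16_t)(b[2 * i] + b[2 * i + 1]);
+  }
+}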
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 +// FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) {} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +// FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) {} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 +// FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) {} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 +// FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8 +// FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) {} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16 +// FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) {} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32 +// FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) {} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8 +// FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) {} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16 +// FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) {} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. 
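+// _mm_shuffle_epi8 above is mostly a vrgather, but the zeroing rule for
+// control bytes with bit 7 set has to be handled explicitly. A plain-C
+// reference of the semantics that any vectorized version has to match
+// (illustrative helper, arrays stand in for __m128i):
+static inline void sse2rvv_ref_shuffle_epi8(const uint8_t a[16],
+                                            const uint8_t b[16],
+                                            uint8_t dst[16])
+{
+  for (int i = 0; i < 16; i++)
+    dst[i] = (b[i] & 0x80) ? 0 : a[b[i] & 0x0F];
+}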
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32 +// FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) {} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8 +// FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) {} + +/* SSE4.1 */ + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +// #define _mm_blend_epi16(a, b, imm) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd +// #define _mm_blend_pd(a, b, imm) + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps +// FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) {} + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 +// FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +// {} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd +// FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) {} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps +// FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) {} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd +// FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) {} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps +// FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) {} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. 
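+// The blendv stubs above select on the top bit of each mask element, which on
+// RVV typically becomes a compare-below-zero mask feeding a merge. A plain-C
+// reference for blendv_epi8 (illustrative helper, arrays stand in for
+// __m128i):
+static inline void sse2rvv_ref_blendv_epi8(const uint8_t a[16],
+                                           const uint8_t b[16],
+                                           const uint8_t mask[16],
+                                           uint8_t dst[16])
+{
+  for (int i = 0; i < 16; i++)
+    dst[i] = (mask[i] & 0x80) ? b[i] : a[i];
+}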
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd +// FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) {} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss +// FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) {} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +// FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) {} + +// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32 +// FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) {} + +// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64 +// FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) {} + +// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64 +// FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) {} + +// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16 +// FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) {} + +// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 +// FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) {} + +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 +// FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) {} + +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 +// FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) {} + +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 +// FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) {} + +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 +// FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) {} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. 
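+// The cvtepi*/cvtepu* stubs above widen only the low lanes of the source. On
+// RVV this is a vsext/vzext with some LMUL bookkeeping; the semantics in
+// plain C, for cvtepi16_epi32 (illustrative helper):
+static inline void sse2rvv_ref_cvtepi16_epi32(const int16_t a[8],
+                                              int32_t dst[4])
+{
+  for (int i = 0; i < 4; i++)
+    dst[i] = (int32_t)a[i]; /* only the low four 16-bit lanes are widened */
+}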
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 +// FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) {} + +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 +// FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) {} + +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 +// FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) {} + +// Conditionally multiply the packed double-precision (64-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, and +// conditionally store the sum in dst using the low 4 bits of imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd +// FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) {} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps +// FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) {} + +// Extract a 32-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +// #define _mm_extract_epi32(a, imm) + +// Extract a 64-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +// #define _mm_extract_epi64(a, imm) + +// Extract an 8-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a, +// __constrange(0,16) int imm) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 +// #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), +// (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +// #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), +// (imm)) + +// Round the packed double-precision (64-bit) floating-point elements in a down +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd +// FORCE_INLINE __m128d _mm_floor_pd(__m128d a) {} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. 
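+// The dot-product stubs above use imm8 twice: the high nibble chooses which
+// products enter the sum and the low nibble chooses which output lanes
+// receive it. A plain-C reference of dp_ps (illustrative helper):
+static inline void sse2rvv_ref_dp_ps(const float a[4], const float b[4],
+                                     int imm8, float dst[4])
+{
+  float sum = 0.0f;
+  for (int i = 0; i < 4; i++)
+    if (imm8 & (0x10 << i))
+      sum += a[i] * b[i];
+  for (int i = 0; i < 4; i++)
+    dst[i] = (imm8 & (1 << i)) ? sum : 0.0f;
+}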
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps +// FORCE_INLINE __m128 _mm_floor_ps(__m128 a) {} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd +// FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) {} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss +// FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) {} + +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +// #define _mm_insert_epi32(a, b, imm) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +// #define _mm_insert_epi64(a, b, imm) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +// #define _mm_insert_epi8(a, b, imm) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +// #define _mm_insert_ps(a, b, imm8) + +// Compare packed signed 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 +// FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) {} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8 +// FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) {} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16 +// FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) {} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +// FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) {} + +// Compare packed signed 32-bit integers in a and b, and store packed minimum +// values in dst. 
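+// The packed min/max stubs above map one-to-one onto vmin/vminu/vmax/vmaxu.
+// A hedged sketch for max_epu16 (illustrative helper, vl fixed to 8):
+static inline void sse2rvv_sketch_max_epu16(const uint16_t *a,
+                                            const uint16_t *b, uint16_t *out)
+{
+  size_t vl = 8;
+  vuint16m1_t va = __riscv_vle16_v_u16m1(a, vl);
+  vuint16m1_t vb = __riscv_vle16_v_u16m1(b, vl);
+  __riscv_vse16_v_u16m1(out, __riscv_vmaxu_vv_u16m1(va, vb, vl), vl);
+}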
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32 +// FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) {} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8 +// FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) {} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16 +// FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) {} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +// FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) {} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16 +// FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) {} + +// Compute the sum of absolute differences (SADs) of quadruplets of unsigned +// 8-bit integers in a compared to those in b, and store the 16-bit results in +// dst. Eight SADs are performed using one quadruplet from b and eight +// quadruplets from a. One quadruplet is selected from b starting at on the +// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit +// integers selected from a starting at the offset specified in imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 +// FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) {} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 +// FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) {} + +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 +// FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) {} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 +// FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) {} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd +// FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) {} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
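+// _mm_minpos_epu16 above returns both the minimum and its first index, with
+// every other bit of dst cleared. A plain-C reference (illustrative helper,
+// arrays stand in for __m128i):
+static inline void sse2rvv_ref_minpos_epu16(const uint16_t a[8],
+                                            uint16_t dst[8])
+{
+  uint16_t min = a[0], idx = 0;
+  for (uint16_t i = 1; i < 8; i++)
+    if (a[i] < min)
+    {
+      min = a[i];
+      idx = i;
+    }
+  for (int i = 0; i < 8; i++)
+    dst[i] = 0;
+  dst[0] = min; /* bits [15:0]  : the minimum value   */
+  dst[1] = idx; /* bits [18:16] : index of that value */
+}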
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +// FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) {} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd +// FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) {} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss +// FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) {} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 +// FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) {} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones +// FORCE_INLINE int _mm_test_all_ones(__m128i a) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros +// FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. 
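+// The test_* stubs above reduce a 128-bit AND to a single flag. A hedged
+// sketch for test_all_zeros (illustrative helper, vl fixed to 2 x int64):
+// AND the operands, compare every lane against zero, and count the hits.
+static inline int sse2rvv_sketch_test_all_zeros(const int64_t *a,
+                                                const int64_t *mask)
+{
+  size_t vl = 2;
+  vint64m1_t v = __riscv_vand_vv_i64m1(__riscv_vle64_v_i64m1(a, vl),
+                                       __riscv_vle64_v_i64m1(mask, vl), vl);
+  vbool64_t zero = __riscv_vmseq_vx_i64m1_b64(v, 0, vl);
+  return __riscv_vcpop_m_b64(zero, vl) == vl; /* 1 iff all lanes ANDed to 0 */
+}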
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 +// FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) {} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 +// #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 +// FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) {} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and returns 1 if b did not contain a null character and the +// resulting mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra +// FORCE_INLINE int _mm_cmpestra(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc +// FORCE_INLINE int _mm_cmpestrc(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri +// FORCE_INLINE int _mm_cmpestri(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm +// FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro +// FORCE_INLINE int _mm_cmpestro(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs +// FORCE_INLINE int _mm_cmpestrs(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. 
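+// (Note for the explicit-length cmpestr* family as a whole: elements of a at
+// indices >= la and elements of b at indices >= lb are treated as invalid
+// characters by the comparison, per the Intel guide's operation pseudocode.)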
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz +// FORCE_INLINE int _mm_cmpestrz(__m128i a, +// int la, +// __m128i b, +// int lb, +// const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if b did not contain a null character and the resulting +// mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra +// FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc +// FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri +// FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm +// FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro +// FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs +// FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) {} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz +// FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) {} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +// FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) {} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 +// FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) {} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32 +// FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) {} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v, and stores the result in dst. 
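+// (The _mm_crc32_* family uses the CRC-32C (Castagnoli) polynomial; the
+// bit-by-bit reference canonical_crc32_u8() in tests/impl.cpp uses its
+// reflected constant 0x82f63b78.)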
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+// FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) {}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+// FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) {}
+
+/* AES */
+
+// Perform one round of an AES encryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// In the absence of an AES extension, aesenc can be implemented with plain
+// vector intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// and https://www.workofard.com/2017/07/ghash-for-low-end-cores/
+// for more information.
+// FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) {}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+// FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) {}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+// FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) {}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+// FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) {}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+// FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) {}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+//
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+// FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) {}
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
+// FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int
+// imm) {}
+
+// FORCE_INLINE unsigned int _sse2rvv_mm_get_denormals_zero_mode(void) {}
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
+// FORCE_INLINE int _mm_popcnt_u32(unsigned int a) {}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
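+// (If the Zbb extension is available, these two intrinsics map naturally onto
+// the RISC-V cpop/cpopw instructions; GCC's and Clang's __builtin_popcountll
+// is a portable fallback. This is only a suggested mapping.)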
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
+// FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) {}
+
+// FORCE_INLINE void _sse2rvv_mm_set_denormals_zero_mode(unsigned int flag) {}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+// FORCE_INLINE uint64_t _rdtsc(void) {}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+// #if defined(__GNUC__) && !defined(__clang__)
+// #pragma GCC pop_options
+// #endif
+
+#endif
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..7f66634
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,29 @@
+# Test Suite for SSE2RVV
+
+:warning: **Warning: The test suite is based on the little-endian architecture.**
+
+## Add More Test Items
+Once the conversion is implemented, the test can be added with the following steps:
+
+* File `tests/impl.h`
+
+  Add the intrinsic under the `INTRIN_LIST` macro. The naming convention
+  should be `mm_xxx`.
+  Place it in the correct classification, in alphabetical order.
+  The classification can be referenced from the [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html).
+
+* File `tests/impl.cpp`
+  ```c
+  result_t test_mm_xxx()
+  {
+    // The C implementation
+    ...
+
+    // The RVV implementation
+    ret = _mm_xxx();
+
+    // Compare the results of the two implementations and return either
+    // TEST_SUCCESS, TEST_FAIL, or TEST_UNIMPL
+    ...
+  }
+  ```
diff --git a/tests/binding.cpp b/tests/binding.cpp
new file mode 100644
index 0000000..6f9b6a3
--- /dev/null
+++ b/tests/binding.cpp
@@ -0,0 +1,31 @@
+#include "binding.h"
+
+#include
+#include
+
+namespace SSE2RVV {
+void *platform_aligned_alloc(size_t size) {
+  void *address;
+#if defined(_WIN32)
+  address = _aligned_malloc(size, 16);
+#else
+  // FIXME
+  // int ret = posix_memalign(&address, 16, size);
+  address = malloc(size);
+#endif
+  if (!address) {
+    fprintf(stderr, "Error at File %s line number %d\n", __FILE__, __LINE__);
+    exit(EXIT_FAILURE);
+  }
+  return address;
+}
+
+void platform_aligned_free(void *ptr) {
+#if defined(_WIN32)
+  _aligned_free(ptr);
+#else
+  free(ptr);
+#endif
+}
+
+} // namespace SSE2RVV
diff --git a/tests/binding.h b/tests/binding.h
new file mode 100644
index 0000000..564b502
--- /dev/null
+++ b/tests/binding.h
@@ -0,0 +1,18 @@
+#ifndef SSE2RVV_BINDING_H
+#define SSE2RVV_BINDING_H
+
+#include
+
+// The SSE2RVV unit tests run both within our own internal project
+// and within the open source framework.
+// This header file is used to abstract any distinctions between
+// those two build environments.
+// +// Initially, this is for how 16 byte aligned memory is allocated +namespace SSE2RVV { +void *platform_aligned_alloc(size_t size); +void platform_aligned_free(void *ptr); + +} // namespace SSE2RVV + +#endif diff --git a/tests/common.cpp b/tests/common.cpp new file mode 100644 index 0000000..33338c0 --- /dev/null +++ b/tests/common.cpp @@ -0,0 +1,330 @@ +#include "common.h" +#include +#include + +namespace SSE2RVV { +int32_t NaN = ~0; +int64_t NaN64 = ~0; + +result_t validate_int64(__m128i a, int64_t i0, int64_t i1) { + const int64_t *t = (const int64_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + return TEST_SUCCESS; +} + +result_t validate_uint64(__m128i a, uint64_t i0, uint64_t i1) { + const uint64_t *t = (const uint64_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + return TEST_SUCCESS; +} + +result_t validate_int64(__m64 a, int64_t i0) { + const int64_t *t = (const int64_t *)&a; + ASSERT_RETURN(t[0] == i0); + return TEST_SUCCESS; +} + +result_t validate_uint64(__m64 a, uint64_t i0) { + const uint64_t *t = (const uint64_t *)&a; + ASSERT_RETURN(t[0] == i0); + return TEST_SUCCESS; +} + +result_t validate_int32(__m128i a, int32_t i0, int32_t i1, int32_t i2, + int32_t i3) { + const int32_t *t = (const int32_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + return TEST_SUCCESS; +} + +result_t validate_uint32(__m128i a, uint32_t u0, uint32_t u1, uint32_t u2, + uint32_t u3) { + const uint32_t *t = (const uint32_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + return TEST_SUCCESS; +} + +result_t validate_int32(__m64 a, int32_t u0, int32_t u1) { + const int32_t *t = (const int32_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + return TEST_SUCCESS; +} + +result_t validate_uint32(__m64 a, uint32_t u0, uint32_t u1) { + const uint32_t *t = (const uint32_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + return TEST_SUCCESS; +} + +result_t validate_int16(__m128i a, int16_t i0, int16_t i1, int16_t i2, + int16_t i3, int16_t i4, int16_t i5, int16_t i6, + int16_t i7) { + const int16_t *t = (const int16_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + ASSERT_RETURN(t[4] == i4); + ASSERT_RETURN(t[5] == i5); + ASSERT_RETURN(t[6] == i6); + ASSERT_RETURN(t[7] == i7); + return TEST_SUCCESS; +} + +result_t validate_uint16(__m128i a, uint16_t u0, uint16_t u1, uint16_t u2, + uint16_t u3, uint16_t u4, uint16_t u5, uint16_t u6, + uint16_t u7) { + const uint16_t *t = (const uint16_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + ASSERT_RETURN(t[4] == u4); + ASSERT_RETURN(t[5] == u5); + ASSERT_RETURN(t[6] == u6); + ASSERT_RETURN(t[7] == u7); + return TEST_SUCCESS; +} + +result_t validate_int16(__m64 a, int16_t i0, int16_t i1, int16_t i2, + int16_t i3) { + const int16_t *t = (const int16_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + return TEST_SUCCESS; +} + +result_t validate_uint16(__m64 a, uint16_t u0, uint16_t u1, uint16_t u2, + uint16_t u3) { + const uint16_t *t = (const uint16_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + return TEST_SUCCESS; +} + +result_t 
validate_int8(__m128i a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, + int8_t i4, int8_t i5, int8_t i6, int8_t i7, int8_t i8, + int8_t i9, int8_t i10, int8_t i11, int8_t i12, + int8_t i13, int8_t i14, int8_t i15) { + const int8_t *t = (const int8_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + ASSERT_RETURN(t[4] == i4); + ASSERT_RETURN(t[5] == i5); + ASSERT_RETURN(t[6] == i6); + ASSERT_RETURN(t[7] == i7); + ASSERT_RETURN(t[8] == i8); + ASSERT_RETURN(t[9] == i9); + ASSERT_RETURN(t[10] == i10); + ASSERT_RETURN(t[11] == i11); + ASSERT_RETURN(t[12] == i12); + ASSERT_RETURN(t[13] == i13); + ASSERT_RETURN(t[14] == i14); + ASSERT_RETURN(t[15] == i15); + return TEST_SUCCESS; +} + +result_t validate_uint8(__m128i a, uint8_t u0, uint8_t u1, uint8_t u2, + uint8_t u3, uint8_t u4, uint8_t u5, uint8_t u6, + uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, + uint8_t u11, uint8_t u12, uint8_t u13, uint8_t u14, + uint8_t u15) { + const uint8_t *t = (const uint8_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + ASSERT_RETURN(t[4] == u4); + ASSERT_RETURN(t[5] == u5); + ASSERT_RETURN(t[6] == u6); + ASSERT_RETURN(t[7] == u7); + ASSERT_RETURN(t[8] == u8); + ASSERT_RETURN(t[9] == u9); + ASSERT_RETURN(t[10] == u10); + ASSERT_RETURN(t[11] == u11); + ASSERT_RETURN(t[12] == u12); + ASSERT_RETURN(t[13] == u13); + ASSERT_RETURN(t[14] == u14); + ASSERT_RETURN(t[15] == u15); + return TEST_SUCCESS; +} + +result_t validate_int8(__m64 a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, + int8_t i4, int8_t i5, int8_t i6, int8_t i7) { + const int8_t *t = (const int8_t *)&a; + ASSERT_RETURN(t[0] == i0); + ASSERT_RETURN(t[1] == i1); + ASSERT_RETURN(t[2] == i2); + ASSERT_RETURN(t[3] == i3); + ASSERT_RETURN(t[4] == i4); + ASSERT_RETURN(t[5] == i5); + ASSERT_RETURN(t[6] == i6); + ASSERT_RETURN(t[7] == i7); + return TEST_SUCCESS; +} + +result_t validate_uint8(__m64 a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, + uint8_t u4, uint8_t u5, uint8_t u6, uint8_t u7) { + const uint8_t *t = (const uint8_t *)&a; + ASSERT_RETURN(t[0] == u0); + ASSERT_RETURN(t[1] == u1); + ASSERT_RETURN(t[2] == u2); + ASSERT_RETURN(t[3] == u3); + ASSERT_RETURN(t[4] == u4); + ASSERT_RETURN(t[5] == u5); + ASSERT_RETURN(t[6] == u6); + ASSERT_RETURN(t[7] == u7); + return TEST_SUCCESS; +} + +result_t validate_float_pair(float a, float b) { + const uint32_t *ua = (const uint32_t *)&a; + const uint32_t *ub = (const uint32_t *)&b; + // We do an integer (binary) compare rather than a + // floating point compare to take NaNs and infinities + // into account as well. + return (*ua) == (*ub) ? TEST_SUCCESS : TEST_FAIL; +} + +result_t validate_double_pair(double a, double b) { + const uint64_t *ua = (const uint64_t *)&a; + const uint64_t *ub = (const uint64_t *)&b; + // We do an integer (binary) compare rather than a + // floating point compare to take NaNs and infinities + // into account as well. + + if (std::isnan(a) && std::isnan(b)) { + return TEST_SUCCESS; + } + + return (*ua) == (*ub) ? 
TEST_SUCCESS : TEST_FAIL; +} + +result_t validate_float(__m64 a, float f0, float f1) { + const float *t = (const float *)&a; + ASSERT_RETURN(validate_float_pair(t[0], f0)); + ASSERT_RETURN(validate_float_pair(t[1], f1)); + return TEST_SUCCESS; +} + +result_t validate_float(__m128 a, float f0, float f1, float f2, float f3) { + const float *t = (const float *)&a; + ASSERT_RETURN(validate_float_pair(t[0], f0)); + ASSERT_RETURN(validate_float_pair(t[1], f1)); + ASSERT_RETURN(validate_float_pair(t[2], f2)); + ASSERT_RETURN(validate_float_pair(t[3], f3)); + return TEST_SUCCESS; +} + +result_t validate_double(__m128d a, double d0, double d1) { + const double *t = (const double *)&a; + ASSERT_RETURN(validate_double_pair(t[0], d0)); + ASSERT_RETURN(validate_double_pair(t[1], d1)); + return TEST_SUCCESS; +} + +result_t validate_float_epsilon(__m128 a, float f0, float f1, float f2, + float f3, float epsilon) { + const float *t = (const float *)&a; + float df0 = fabsf(t[0] - f0); + float df1 = fabsf(t[1] - f1); + float df2 = fabsf(t[2] - f2); + float df3 = fabsf(t[3] - f3); + + // Due to floating-point error, subtracting floating-point number with NaN + // and zero value usually produces erroneous result. Therefore, we directly + // define the difference of two floating-point numbers to zero if both + // numbers are NaN or zero. + if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0)) { + df0 = 0; + } + + if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0)) { + df1 = 0; + } + + if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0)) { + df2 = 0; + } + + if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0)) { + df3 = 0; + } + + ASSERT_RETURN(df0 < epsilon); + ASSERT_RETURN(df1 < epsilon); + ASSERT_RETURN(df2 < epsilon); + ASSERT_RETURN(df3 < epsilon); + return TEST_SUCCESS; +} + +result_t validate_float_error(__m128 a, float f0, float f1, float f2, float f3, + float err) { + const float *t = (const float *)&a; + float df0 = fabsf((t[0] - f0) / f0); + float df1 = fabsf((t[1] - f1) / f1); + float df2 = fabsf((t[2] - f2) / f2); + float df3 = fabsf((t[3] - f3) / f3); + + if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0) || + (std::isinf(t[0]) && std::isinf(f0))) { + df0 = 0; + } + + if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0) || + (std::isinf(t[1]) && std::isinf(f1))) { + df1 = 0; + } + + if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0) || + (std::isinf(t[2]) && std::isinf(f2))) { + df2 = 0; + } + + if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0) || + (std::isinf(t[3]) && std::isinf(f3))) { + df3 = 0; + } + + ASSERT_RETURN(df0 < err); + ASSERT_RETURN(df1 < err); + ASSERT_RETURN(df2 < err); + ASSERT_RETURN(df3 < err); + return TEST_SUCCESS; +} + +result_t validate_double_error(__m128d a, double d0, double d1, double err) { + const double *t = (const double *)&a; + double td0 = fabs((t[0] - d0) / d0); + double td1 = fabs((t[1] - d1) / d1); + + if (std::isnan(t[0]) && std::isnan(d0)) { + td0 = 0; + } + + if (std::isnan(t[1]) && std::isnan(d1)) { + td1 = 0; + } + + ASSERT_RETURN(td0 < err); + ASSERT_RETURN(td1 < err); + return TEST_SUCCESS; +} + +} // namespace SSE2RVV diff --git a/tests/common.h b/tests/common.h new file mode 100644 index 0000000..efb2357 --- /dev/null +++ b/tests/common.h @@ -0,0 +1,421 @@ +#ifndef SSE2RVV_COMMON_H +#define SSE2RVV_COMMON_H +#include +#if defined(__riscv) || defined(__riscv__) +#include "sse2rvv.h" +#elif defined(__x86_64__) || defined(__i386__) 
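+// On x86 hosts the native Intel intrinsic headers are included instead, so the
+// same test bodies can be compiled and checked against genuine SSE behaviour.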
+#include +#include +#include +#include +#include +#include + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("ALIGN_STRUCT") +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#else +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif + +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#endif + +/* Tunable testing configuration for precise testing */ +/* _mm_min|max_ps|ss|pd|sd */ +#ifndef SSE2RVV_PRECISE_MINMAX +#define SSE2RVV_PRECISE_MINMAX (0) +#endif +#endif + +#define ASSERT_RETURN(x) \ + if (!(x)) \ + return TEST_FAIL; + +namespace SSE2RVV { +enum result_t { + TEST_SUCCESS = 1, + TEST_FAIL = 0, + TEST_UNIMPL = -1, +}; +extern int32_t NaN; +extern int64_t NaN64; +#define ALL_BIT_1_32 (*(float *)&NaN) +#define ALL_BIT_1_64 (*(double *)&NaN64) + +template result_t validate_128bits(T a, T b) { + const int32_t *t1 = (const int32_t *)&a; + const int32_t *t2 = (const int32_t *)&b; + + ASSERT_RETURN(t1[0] == t2[0]); + ASSERT_RETURN(t1[1] == t2[1]); + ASSERT_RETURN(t1[2] == t2[2]); + ASSERT_RETURN(t1[3] == t2[3]); + return TEST_SUCCESS; +} +result_t validate_int64(__m128i a, int64_t i0, int64_t i1); +result_t validate_uint64(__m128i a, uint64_t i0, uint64_t i1); +result_t validate_int64(__m64 a, int64_t i0); +result_t validate_uint64(__m64 a, uint64_t i0); +result_t validate_int32(__m128i a, int32_t i0, int32_t i1, int32_t i2, + int32_t i3); +result_t validate_uint32(__m128i a, uint32_t u0, uint32_t u1, uint32_t u2, + uint32_t u3); +result_t validate_int32(__m64 a, int32_t u0, int32_t u1); +result_t validate_uint32(__m64 a, uint32_t u0, uint32_t u1); +result_t validate_int16(__m128i a, int16_t i0, int16_t i1, int16_t i2, + int16_t i3, int16_t i4, int16_t i5, int16_t i6, + int16_t i7); +result_t validate_uint16(__m128i a, uint16_t u0, uint16_t u1, uint16_t u2, + uint16_t u3, uint16_t u4, uint16_t u5, uint16_t u6, + uint16_t u7); +result_t validate_int16(__m64 a, int16_t i0, int16_t i1, int16_t i2, + int16_t i3); +result_t validate_uint16(__m64 a, uint16_t u0, uint16_t u1, uint16_t u2, + uint16_t u3); +result_t validate_int8(__m128i a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, + int8_t i4, int8_t i5, int8_t i6, int8_t i7, int8_t i8, + int8_t i9, int8_t i10, int8_t i11, int8_t i12, + int8_t i13, int8_t i14, int8_t i15); +result_t validate_uint8(__m128i a, uint8_t u0, uint8_t u1, uint8_t u2, + uint8_t u3, uint8_t u4, uint8_t u5, uint8_t u6, + uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, + uint8_t u11, uint8_t u12, uint8_t u13, uint8_t u14, + uint8_t u15); +result_t validate_int8(__m64 a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, + int8_t i4, int8_t i5, int8_t i6, int8_t i7); +result_t 
validate_uint8(__m64 a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, + uint8_t u4, uint8_t u5, uint8_t u6, uint8_t u7); +result_t validate_float_pair(float a, float b); +result_t validate_double_pair(double a, double b); +result_t validate_float(__m64 a, float f0, float f1); +result_t validate_float(__m128 a, float f0, float f1, float f2, float f3); +result_t validate_float_epsilon(__m128 a, float f0, float f1, float f2, + float f3, float epsilon); +result_t validate_float_error(__m128 a, float f0, float f1, float f2, float f3, + float err); +result_t validate_double(__m128d a, double d0, double d1); +result_t validate_double_error(__m128d a, double d0, double d1, double err); + +#define VALIDATE_INT8_M128(A, B) \ + validate_int8(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7], B[8], B[9], \ + B[10], B[11], B[12], B[13], B[14], B[15]) +#define VALIDATE_UINT8_M128(A, B) \ + validate_uint8(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7], B[8], \ + B[9], B[10], B[11], B[12], B[13], B[14], B[15]) +#define VALIDATE_INT16_M128(A, B) \ + validate_int16(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]) +#define VALIDATE_UINT16_M128(A, B) \ + validate_uint16(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]) +#define VALIDATE_INT32_M128(A, B) validate_int32(A, B[0], B[1], B[2], B[3]) +#define VALIDATE_UINT32_M128(A, B) validate_uint32(A, B[0], B[1], B[2], B[3]) + +#define VALIDATE_INT8_M64(A, B) \ + validate_int8(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]) +#define VALIDATE_UINT8_M64(A, B) \ + validate_uint8(A, B[0], B[1], B[2], B[3], B[4], B[5], B[6], B[7]) +#define VALIDATE_INT16_M64(A, B) validate_int16(A, B[0], B[1], B[2], B[3]) +#define VALIDATE_UINT16_M64(A, B) validate_uint16(A, B[0], B[1], B[2], B[3]) +#define VALIDATE_INT32_M64(A, B) validate_int32(A, B[0], B[1]) +#define VALIDATE_UINT32_M64(A, B) validate_uint32(A, B[0], B[1]) +#define CHECK_RESULT(EXP) \ + if (EXP != TEST_SUCCESS) { \ + return TEST_FAIL; \ + } +#define IMM_2_ITER \ + TEST_IMPL(0) \ + TEST_IMPL(1) +#define IMM_4_ITER \ + IMM_2_ITER \ + TEST_IMPL(2) \ + TEST_IMPL(3) +#define IMM_8_ITER \ + IMM_4_ITER \ + TEST_IMPL(4) \ + TEST_IMPL(5) \ + TEST_IMPL(6) \ + TEST_IMPL(7) +#define IMM_16_ITER \ + IMM_8_ITER \ + TEST_IMPL(8) \ + TEST_IMPL(9) \ + TEST_IMPL(10) \ + TEST_IMPL(11) \ + TEST_IMPL(12) \ + TEST_IMPL(13) \ + TEST_IMPL(14) \ + TEST_IMPL(15) +#define IMM_32_ITER \ + IMM_16_ITER \ + TEST_IMPL(16) \ + TEST_IMPL(17) \ + TEST_IMPL(18) \ + TEST_IMPL(19) \ + TEST_IMPL(20) \ + TEST_IMPL(21) \ + TEST_IMPL(22) \ + TEST_IMPL(23) \ + TEST_IMPL(24) \ + TEST_IMPL(25) \ + TEST_IMPL(26) \ + TEST_IMPL(27) \ + TEST_IMPL(28) \ + TEST_IMPL(29) \ + TEST_IMPL(30) \ + TEST_IMPL(31) +#define IMM_64_ITER \ + IMM_32_ITER \ + TEST_IMPL(32) \ + TEST_IMPL(33) \ + TEST_IMPL(34) \ + TEST_IMPL(35) \ + TEST_IMPL(36) \ + TEST_IMPL(37) \ + TEST_IMPL(38) \ + TEST_IMPL(39) \ + TEST_IMPL(40) \ + TEST_IMPL(41) \ + TEST_IMPL(42) \ + TEST_IMPL(43) \ + TEST_IMPL(44) \ + TEST_IMPL(45) \ + TEST_IMPL(46) \ + TEST_IMPL(47) \ + TEST_IMPL(48) \ + TEST_IMPL(49) \ + TEST_IMPL(50) \ + TEST_IMPL(51) \ + TEST_IMPL(52) \ + TEST_IMPL(53) \ + TEST_IMPL(54) \ + TEST_IMPL(55) \ + TEST_IMPL(56) \ + TEST_IMPL(57) \ + TEST_IMPL(58) \ + TEST_IMPL(59) \ + TEST_IMPL(60) \ + TEST_IMPL(61) \ + TEST_IMPL(62) \ + TEST_IMPL(63) +#define IMM_128_ITER \ + IMM_64_ITER \ + TEST_IMPL(64) \ + TEST_IMPL(65) \ + TEST_IMPL(66) \ + TEST_IMPL(67) \ + TEST_IMPL(68) \ + TEST_IMPL(69) \ + TEST_IMPL(70) \ + TEST_IMPL(71) \ + TEST_IMPL(72) \ + TEST_IMPL(73) \ + TEST_IMPL(74) \ + 
TEST_IMPL(75) \ + TEST_IMPL(76) \ + TEST_IMPL(77) \ + TEST_IMPL(78) \ + TEST_IMPL(79) \ + TEST_IMPL(80) \ + TEST_IMPL(81) \ + TEST_IMPL(82) \ + TEST_IMPL(83) \ + TEST_IMPL(84) \ + TEST_IMPL(85) \ + TEST_IMPL(86) \ + TEST_IMPL(87) \ + TEST_IMPL(88) \ + TEST_IMPL(89) \ + TEST_IMPL(90) \ + TEST_IMPL(91) \ + TEST_IMPL(92) \ + TEST_IMPL(93) \ + TEST_IMPL(94) \ + TEST_IMPL(95) \ + TEST_IMPL(96) \ + TEST_IMPL(97) \ + TEST_IMPL(98) \ + TEST_IMPL(99) \ + TEST_IMPL(100) \ + TEST_IMPL(101) \ + TEST_IMPL(102) \ + TEST_IMPL(103) \ + TEST_IMPL(104) \ + TEST_IMPL(105) \ + TEST_IMPL(106) \ + TEST_IMPL(107) \ + TEST_IMPL(108) \ + TEST_IMPL(109) \ + TEST_IMPL(110) \ + TEST_IMPL(111) \ + TEST_IMPL(112) \ + TEST_IMPL(113) \ + TEST_IMPL(114) \ + TEST_IMPL(115) \ + TEST_IMPL(116) \ + TEST_IMPL(117) \ + TEST_IMPL(118) \ + TEST_IMPL(119) \ + TEST_IMPL(120) \ + TEST_IMPL(121) \ + TEST_IMPL(122) \ + TEST_IMPL(123) \ + TEST_IMPL(124) \ + TEST_IMPL(125) \ + TEST_IMPL(126) \ + TEST_IMPL(127) +#define IMM_256_ITER \ + IMM_128_ITER \ + TEST_IMPL(128) \ + TEST_IMPL(129) \ + TEST_IMPL(130) \ + TEST_IMPL(131) \ + TEST_IMPL(132) \ + TEST_IMPL(133) \ + TEST_IMPL(134) \ + TEST_IMPL(135) \ + TEST_IMPL(136) \ + TEST_IMPL(137) \ + TEST_IMPL(138) \ + TEST_IMPL(139) \ + TEST_IMPL(140) \ + TEST_IMPL(141) \ + TEST_IMPL(142) \ + TEST_IMPL(143) \ + TEST_IMPL(144) \ + TEST_IMPL(145) \ + TEST_IMPL(146) \ + TEST_IMPL(147) \ + TEST_IMPL(148) \ + TEST_IMPL(149) \ + TEST_IMPL(150) \ + TEST_IMPL(151) \ + TEST_IMPL(152) \ + TEST_IMPL(153) \ + TEST_IMPL(154) \ + TEST_IMPL(155) \ + TEST_IMPL(156) \ + TEST_IMPL(157) \ + TEST_IMPL(158) \ + TEST_IMPL(159) \ + TEST_IMPL(160) \ + TEST_IMPL(161) \ + TEST_IMPL(162) \ + TEST_IMPL(163) \ + TEST_IMPL(164) \ + TEST_IMPL(165) \ + TEST_IMPL(166) \ + TEST_IMPL(167) \ + TEST_IMPL(168) \ + TEST_IMPL(169) \ + TEST_IMPL(170) \ + TEST_IMPL(171) \ + TEST_IMPL(172) \ + TEST_IMPL(173) \ + TEST_IMPL(174) \ + TEST_IMPL(175) \ + TEST_IMPL(176) \ + TEST_IMPL(177) \ + TEST_IMPL(178) \ + TEST_IMPL(179) \ + TEST_IMPL(180) \ + TEST_IMPL(181) \ + TEST_IMPL(182) \ + TEST_IMPL(183) \ + TEST_IMPL(184) \ + TEST_IMPL(185) \ + TEST_IMPL(186) \ + TEST_IMPL(187) \ + TEST_IMPL(188) \ + TEST_IMPL(189) \ + TEST_IMPL(190) \ + TEST_IMPL(191) \ + TEST_IMPL(192) \ + TEST_IMPL(193) \ + TEST_IMPL(194) \ + TEST_IMPL(195) \ + TEST_IMPL(196) \ + TEST_IMPL(197) \ + TEST_IMPL(198) \ + TEST_IMPL(199) \ + TEST_IMPL(200) \ + TEST_IMPL(201) \ + TEST_IMPL(202) \ + TEST_IMPL(203) \ + TEST_IMPL(204) \ + TEST_IMPL(205) \ + TEST_IMPL(206) \ + TEST_IMPL(207) \ + TEST_IMPL(208) \ + TEST_IMPL(209) \ + TEST_IMPL(210) \ + TEST_IMPL(211) \ + TEST_IMPL(212) \ + TEST_IMPL(213) \ + TEST_IMPL(214) \ + TEST_IMPL(215) \ + TEST_IMPL(216) \ + TEST_IMPL(217) \ + TEST_IMPL(218) \ + TEST_IMPL(219) \ + TEST_IMPL(220) \ + TEST_IMPL(221) \ + TEST_IMPL(222) \ + TEST_IMPL(223) \ + TEST_IMPL(224) \ + TEST_IMPL(225) \ + TEST_IMPL(226) \ + TEST_IMPL(227) \ + TEST_IMPL(228) \ + TEST_IMPL(229) \ + TEST_IMPL(230) \ + TEST_IMPL(231) \ + TEST_IMPL(232) \ + TEST_IMPL(233) \ + TEST_IMPL(234) \ + TEST_IMPL(235) \ + TEST_IMPL(236) \ + TEST_IMPL(237) \ + TEST_IMPL(238) \ + TEST_IMPL(239) \ + TEST_IMPL(240) \ + TEST_IMPL(241) \ + TEST_IMPL(242) \ + TEST_IMPL(243) \ + TEST_IMPL(244) \ + TEST_IMPL(245) \ + TEST_IMPL(246) \ + TEST_IMPL(247) \ + TEST_IMPL(248) \ + TEST_IMPL(249) \ + TEST_IMPL(250) \ + TEST_IMPL(251) \ + TEST_IMPL(252) \ + TEST_IMPL(253) \ + TEST_IMPL(254) \ + TEST_IMPL(255) +} // namespace SSE2RVV + +#endif diff --git a/tests/impl.cpp b/tests/impl.cpp new file mode 100644 index 
0000000..53dddc3 --- /dev/null +++ b/tests/impl.cpp @@ -0,0 +1,9570 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "binding.h" +#include "impl.h" + +// Try 10,000 random floating point values for each test we run +#define MAX_TEST_VALUE 10000 + +/* Pattern Matching for C macros. + * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms + */ + +/* catenate */ +#define PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ + +#define IIF(c) PRIMITIVE_CAT(IIF_, c) +/* run the 2nd parameter */ +#define IIF_0(t, ...) __VA_ARGS__ +/* run the 1st parameter */ +#define IIF_1(t, ...) t + +// This program a set of unit tests to ensure that each SSE call provide the +// output we expect. If this fires an assert, then something didn't match up. +// +// Functions with "test_" prefix will be called in run_single_test. +namespace SSE2RVV { +// Forward declaration +class SSE2RVV_TEST_IMPL : public SSE2RVV_TEST { +public: + SSE2RVV_TEST_IMPL(void); + result_t load_test_float_pointers(uint32_t i); + result_t load_test_int_pointers(uint32_t i); + result_t run_single_test(INSTRUCTION_TEST test, uint32_t i); + + float *test_cases_float_pointer1; + float *test_cases_float_pointer2; + int32_t *test_cases_int_pointer1; + int32_t *test_cases_int_pointer2; + float test_cases_floats[MAX_TEST_VALUE]; + int32_t test_cases_ints[MAX_TEST_VALUE]; + + virtual ~SSE2RVV_TEST_IMPL(void) { + platform_aligned_free(test_cases_float_pointer1); + platform_aligned_free(test_cases_float_pointer2); + platform_aligned_free(test_cases_int_pointer1); + platform_aligned_free(test_cases_int_pointer2); + } + virtual void release(void) { delete this; } + virtual result_t run_test(INSTRUCTION_TEST test) { + result_t ret = TEST_SUCCESS; + + // Test a whole bunch of values + for (uint32_t i = 0; i < (MAX_TEST_VALUE - 8); i++) { + ret = load_test_float_pointers(i); // Load some random float values + if (ret == TEST_FAIL) + break; // load test float failed?? + ret = load_test_int_pointers(i); // load some random int values + if (ret == TEST_FAIL) + break; // load test float failed?? + // If we are testing the reciprocal, then invert the input data + // (easier for debugging) + if (test == it_mm_rcp_ps) { + test_cases_float_pointer1[0] = 1.0f / test_cases_float_pointer1[0]; + test_cases_float_pointer1[1] = 1.0f / test_cases_float_pointer1[1]; + test_cases_float_pointer1[2] = 1.0f / test_cases_float_pointer1[2]; + test_cases_float_pointer1[3] = 1.0f / test_cases_float_pointer1[3]; + } + if (test == it_mm_rcp_ps || test == it_mm_rcp_ss || + test == it_mm_rsqrt_ps || test == it_mm_rsqrt_ss) { + if ((rand() & 3) == 0) { + uint32_t r1 = rand() & 3; + uint32_t r2 = rand() & 3; + uint32_t r3 = rand() & 3; + uint32_t r4 = rand() & 3; + uint32_t r5 = rand() & 3; + uint32_t r6 = rand() & 3; + uint32_t r7 = rand() & 3; + uint32_t r8 = rand() & 3; + test_cases_float_pointer1[r1] = 0.0f; + test_cases_float_pointer1[r2] = 0.0f; + test_cases_float_pointer1[r3] = 0.0f; + test_cases_float_pointer1[r4] = 0.0f; + test_cases_float_pointer1[r5] = -0.0f; + test_cases_float_pointer1[r6] = -0.0f; + test_cases_float_pointer1[r7] = -0.0f; + test_cases_float_pointer1[r8] = -0.0f; + } + } + if (test == it_mm_cmpge_ps || test == it_mm_cmpge_ss || + test == it_mm_cmple_ps || test == it_mm_cmple_ss || + test == it_mm_cmpeq_ps || test == it_mm_cmpeq_ss) { + // Make sure at least one value is the same. 
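+        // (Lane 3 is forced equal so that the >=, <= and == comparisons are
+        // guaranteed to exercise their all-ones "true" result at least once.)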
+ test_cases_float_pointer1[3] = test_cases_float_pointer2[3]; + } + + if (test == it_mm_cmpord_ps || test == it_mm_cmpord_ss || + test == it_mm_cmpunord_ps || test == it_mm_cmpunord_ss || + test == it_mm_cmpeq_ps || test == it_mm_cmpeq_ss || + test == it_mm_cmpge_ps || test == it_mm_cmpge_ss || + test == it_mm_cmpgt_ps || test == it_mm_cmpgt_ss || + test == it_mm_cmple_ps || test == it_mm_cmple_ss || + test == it_mm_cmplt_ps || test == it_mm_cmplt_ss || + test == it_mm_cmpneq_ps || test == it_mm_cmpneq_ss || + test == it_mm_cmpnge_ps || test == it_mm_cmpnge_ss || + test == it_mm_cmpngt_ps || test == it_mm_cmpngt_ss || + test == it_mm_cmpnle_ps || test == it_mm_cmpnle_ss || + test == it_mm_cmpnlt_ps || test == it_mm_cmpnlt_ss || + test == it_mm_comieq_ss || test == it_mm_ucomieq_ss || + test == it_mm_comige_ss || test == it_mm_ucomige_ss || + test == it_mm_comigt_ss || test == it_mm_ucomigt_ss || + test == it_mm_comile_ss || test == it_mm_ucomile_ss || + test == it_mm_comilt_ss || test == it_mm_ucomilt_ss || + test == it_mm_comineq_ss || test == it_mm_ucomineq_ss) { + // Make sure the NaN values are included in the testing + // one out of four times. + if ((rand() & 3) == 0) { + uint32_t r1 = rand() & 3; + uint32_t r2 = rand() & 3; + test_cases_float_pointer1[r1] = nanf(""); + test_cases_float_pointer2[r2] = nanf(""); + } + } + + if (test == it_mm_cmpord_pd || test == it_mm_cmpord_sd || + test == it_mm_cmpunord_pd || test == it_mm_cmpunord_sd || + test == it_mm_cmpeq_pd || test == it_mm_cmpeq_sd || + test == it_mm_cmpge_pd || test == it_mm_cmpge_sd || + test == it_mm_cmpgt_pd || test == it_mm_cmpgt_sd || + test == it_mm_cmple_pd || test == it_mm_cmple_sd || + test == it_mm_cmplt_pd || test == it_mm_cmplt_sd || + test == it_mm_cmpneq_pd || test == it_mm_cmpneq_sd || + test == it_mm_cmpnge_pd || test == it_mm_cmpnge_sd || + test == it_mm_cmpngt_pd || test == it_mm_cmpngt_sd || + test == it_mm_cmpnle_pd || test == it_mm_cmpnle_sd || + test == it_mm_cmpnlt_pd || test == it_mm_cmpnlt_sd || + test == it_mm_comieq_sd || test == it_mm_ucomieq_sd || + test == it_mm_comige_sd || test == it_mm_ucomige_sd || + test == it_mm_comigt_sd || test == it_mm_ucomigt_sd || + test == it_mm_comile_sd || test == it_mm_ucomile_sd || + test == it_mm_comilt_sd || test == it_mm_ucomilt_sd || + test == it_mm_comineq_sd || test == it_mm_ucomineq_sd) { + // Make sure the NaN values are included in the testing + // one out of four times. + if ((rand() & 3) == 0) { + // FIXME: + // The argument "0xFFFFFFFFFFFF" is a tricky workaround to + // set the NaN value for doubles. The code is not intuitive + // and should be fixed in the future. + uint32_t r1 = ((rand() & 1) << 1) + 1; + uint32_t r2 = ((rand() & 1) << 1) + 1; + test_cases_float_pointer1[r1] = nanf("0xFFFFFFFFFFFF"); + test_cases_float_pointer2[r2] = nanf("0xFFFFFFFFFFFF"); + } + } + + if (test == it_mm_max_pd || test == it_mm_max_sd || + test == it_mm_min_pd || test == it_mm_min_sd) { + // Make sure the positive/negative infinity values are included + // in the testing one out of four times. 
+ if ((rand() & 3) == 0) { + uint32_t r1 = ((rand() & 1) << 1) + 1; + uint32_t r2 = ((rand() & 1) << 1) + 1; + uint32_t r3 = ((rand() & 1) << 1) + 1; + uint32_t r4 = ((rand() & 1) << 1) + 1; + test_cases_float_pointer1[r1] = INFINITY; + test_cases_float_pointer2[r2] = INFINITY; + test_cases_float_pointer1[r3] = -INFINITY; + test_cases_float_pointer1[r4] = -INFINITY; + } + } + +#if SSE2RVV_PRECISE_MINMAX + if (test == it_mm_max_ps || test == it_mm_max_ss || + test == it_mm_min_ps || test == it_mm_min_ss) { + // Make sure the NaN values are included in the testing + // one out of four times. + if ((rand() & 3) == 0) { + uint32_t r1 = rand() & 3; + uint32_t r2 = rand() & 3; + test_cases_float_pointer1[r1] = nanf(""); + test_cases_float_pointer2[r2] = nanf(""); + } + } + + if (test == it_mm_max_pd || test == it_mm_max_sd || + test == it_mm_min_pd || test == it_mm_min_sd) { + // Make sure the NaN values are included in the testing + // one out of four times. + if ((rand() & 3) == 0) { + // FIXME: + // The argument "0xFFFFFFFFFFFF" is a tricky workaround to + // set the NaN value for doubles. The code is not intuitive + // and should be fixed in the future. + uint32_t r1 = ((rand() & 1) << 1) + 1; + uint32_t r2 = ((rand() & 1) << 1) + 1; + test_cases_float_pointer1[r1] = nanf("0xFFFFFFFFFFFF"); + test_cases_float_pointer2[r2] = nanf("0xFFFFFFFFFFFF"); + } + } +#endif + + // one out of every random 64 times or so, mix up the test floats to + // contain some integer values + if ((rand() & 63) == 0) { + uint32_t option = rand() & 3; + switch (option) { + // All integers.. + case 0: + test_cases_float_pointer1[0] = float(test_cases_int_pointer1[0]); + test_cases_float_pointer1[1] = float(test_cases_int_pointer1[1]); + test_cases_float_pointer1[2] = float(test_cases_int_pointer1[2]); + test_cases_float_pointer1[3] = float(test_cases_int_pointer1[3]); + + test_cases_float_pointer2[0] = float(test_cases_int_pointer2[0]); + test_cases_float_pointer2[1] = float(test_cases_int_pointer2[1]); + test_cases_float_pointer2[2] = float(test_cases_int_pointer2[2]); + test_cases_float_pointer2[3] = float(test_cases_int_pointer2[3]); + + break; + case 1: { + uint32_t index = rand() & 3; + test_cases_float_pointer1[index] = + float(test_cases_int_pointer1[index]); + index = rand() & 3; + test_cases_float_pointer2[index] = + float(test_cases_int_pointer2[index]); + } break; + case 2: { + uint32_t index1 = rand() & 3; + uint32_t index2 = rand() & 3; + test_cases_float_pointer1[index1] = + float(test_cases_int_pointer1[index1]); + test_cases_float_pointer1[index2] = + float(test_cases_int_pointer1[index2]); + index1 = rand() & 3; + index2 = rand() & 3; + test_cases_float_pointer1[index1] = + float(test_cases_int_pointer1[index1]); + test_cases_float_pointer1[index2] = + float(test_cases_int_pointer1[index2]); + } break; + case 3: + test_cases_float_pointer1[0] = float(test_cases_int_pointer1[0]); + test_cases_float_pointer1[1] = float(test_cases_int_pointer1[1]); + test_cases_float_pointer1[2] = float(test_cases_int_pointer1[2]); + test_cases_float_pointer1[3] = float(test_cases_int_pointer1[3]); + break; + } + if ((rand() & 3) == 0) { // one out of 4 times, make halves + for (uint32_t j = 0; j < 4; j++) { + test_cases_float_pointer1[j] *= 0.5f; + test_cases_float_pointer2[j] *= 0.5f; + } + } + } + + ret = run_single_test(test, i); + if (ret == TEST_FAIL) // the test failed... 
+ { + // Set a breakpoint here if you want to step through the failure + // case in the debugger + ret = run_single_test(test, i); + break; + } + } + return ret; + } +}; + +const char *instructionString[] = { +#define _(x) #x, + INTRIN_LIST +#undef _ +}; + +// Produce rounding which is the same as SSE instructions with _MM_ROUND_NEAREST +// rounding mode +static inline float bankersRounding(float val) { + if (val < 0) + return -bankersRounding(-val); + + float ret; + float roundDown = floorf(val); // Round down value + float roundUp = ceilf(val); // Round up value + float diffDown = val - roundDown; + float diffUp = roundUp - val; + + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + ret = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + ret = roundUp; + } else { + /* If it's equidistant between round up and round down value, pick the + * one which is an even number */ + float half = roundDown / 2; + if (half != floorf(half)) { + /* If the round down value is odd, return the round up value */ + ret = roundUp; + } else { + /* If the round up value is odd, return the round down value */ + ret = roundDown; + } + } + return ret; +} + +static inline double bankersRounding(double val) { + if (val < 0) + return -bankersRounding(-val); + + double ret; + double roundDown = floor(val); // Round down value + double roundUp = ceil(val); // Round up value + double diffDown = val - roundDown; + double diffUp = roundUp - val; + + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + ret = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + ret = roundUp; + } else { + /* If it's equidistant between round up and round down value, pick the + * one which is an even number */ + double half = roundDown / 2; + if (half != floor(half)) { + /* If the round down value is odd, return the round up value */ + ret = roundUp; + } else { + /* If the round up value is odd, return the round down value */ + ret = roundDown; + } + } + return ret; +} + +// SplitMix64 PRNG by Sebastiano Vigna, see: +// +static uint64_t state; // the state of SplitMix64 PRNG +const double TWOPOWER64 = pow(2, 64); + +#define SSE2RVV_INIT_RNG(seed) \ + do { \ + state = seed; \ + } while (0) + +static double next() { + uint64_t z = (state += 0x9e3779b97f4a7c15); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + return z ^ (z >> 31); +} + +static float ranf() { return next() / TWOPOWER64; } + +static float ranf(float low, float high) { return ranf() * (high - low) + low; } + +// Enable the tests which are using the macro of another tests +result_t test_mm_slli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); +result_t test_mm_srli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); +result_t test_mm_shuffle_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_set_epi32". +// __m128i do_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w) { +// __m128i a = _mm_set_epi32(x, y, z, w); +// validate_int32(a, w, z, y, x); +// return a; +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to load __m64 data. 
+// template __m64 load_m64(const T *p) { return *((const __m64 *)p); } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_load_ps". +// template __m128 load_m128(const T *p) { +// return _mm_loadu_ps((const float *)p); +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_load_ps". +// template __m128i load_m128i(const T *p) { +// __m128 a = _mm_loadu_ps((const float *)p); +// __m128i ia = *(const __m128i *)&a; +// return ia; +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_load_pd". +// template __m128d load_m128d(const T *p) { +// return _mm_loadu_pd((const double *)p); +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_store_ps". +// result_t do_mm_store_ps(float *p, float x, float y, float z, float w) { +// __m128 a = _mm_set_ps(x, y, z, w); +// _mm_store_ps(p, a); +// ASSERT_RETURN(p[0] == w); +// ASSERT_RETURN(p[1] == z); +// ASSERT_RETURN(p[2] == y); +// ASSERT_RETURN(p[3] == x); +// return TEST_SUCCESS; +// } + +// This function is not called from "run_single_test", but for other intrinsic +// tests that might need to call "_mm_store_ps". +// result_t do_mm_store_ps(int32_t *p, int32_t x, int32_t y, int32_t z, +// int32_t w) { +// __m128i a = _mm_set_epi32(x, y, z, w); +// _mm_store_ps((float *)p, *(const __m128 *)&a); +// ASSERT_RETURN(p[0] == w); +// ASSERT_RETURN(p[1] == z); +// ASSERT_RETURN(p[2] == y); +// ASSERT_RETURN(p[3] == x); +// return TEST_SUCCESS; +// } + +float cmp_noNaN(float a, float b) { + return (!isnan(a) && !isnan(b)) ? ALL_BIT_1_32 : 0.0f; +} + +double cmp_noNaN(double a, double b) { + return (!isnan(a) && !isnan(b)) ? ALL_BIT_1_64 : 0.0f; +} + +float cmp_hasNaN(float a, float b) { + return (isnan(a) || isnan(b)) ? ALL_BIT_1_32 : 0.0f; +} + +double cmp_hasNaN(double a, double b) { + return (isnan(a) || isnan(b)) ? 
ALL_BIT_1_64 : 0.0f; +} + +int32_t comilt_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a < b); +} + +int32_t comigt_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a > b); +} + +int32_t comile_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a <= b); +} + +int32_t comige_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a >= b); +} + +int32_t comieq_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 0; + return (a == b); +} + +int32_t comineq_ss(float a, float b) { + if (isnan(a) || isnan(b)) + return 1; + return (a != b); +} + +static inline int16_t saturate_16(int32_t a) { + int32_t max = (1 << 15) - 1; + int32_t min = -(1 << 15); + if (a > max) + return max; + if (a < min) + return min; + return a; +} + +uint32_t canonical_crc32_u8(uint32_t crc, uint8_t v) { + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ uint32_t(0x82f63b78); + else + crc = (crc >> 1); + } + return crc; +} + +uint32_t canonical_crc32_u16(uint32_t crc, uint16_t v) { + crc = canonical_crc32_u8(crc, v & 0xff); + crc = canonical_crc32_u8(crc, (v >> 8) & 0xff); + return crc; +} + +uint32_t canonical_crc32_u32(uint32_t crc, uint32_t v) { + crc = canonical_crc32_u16(crc, v & 0xffff); + crc = canonical_crc32_u16(crc, (v >> 16) & 0xffff); + return crc; +} + +uint64_t canonical_crc32_u64(uint64_t crc, uint64_t v) { + crc = canonical_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = canonical_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); + return crc; +} + +static const uint8_t crypto_aes_sbox[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, + 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, + 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, + 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, + 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, + 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, + 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, + 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, + 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, + 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, + 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, + 0xb0, 0x54, 0xbb, 0x16, +}; + +static const uint8_t crypto_aes_rsbox[256] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, + 0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32, + 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 
0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, + 0x6d, 0x8b, 0xd1, 0x25, 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 0x6c, 0x70, 0x48, 0x50, + 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, + 0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91, 0x11, 0x41, + 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, + 0x1c, 0x75, 0xdf, 0x6e, 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b, + 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, + 0x27, 0x80, 0xec, 0x5f, 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d, + 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, + 0x55, 0x21, 0x0c, 0x7d, +}; + +// XT is x_time function that muliplies 'x' by 2 in GF(2^8) +#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) + +inline __m128i aesenc_128_reference(__m128i a, __m128i b) { + uint8_t i, t, u, v[4][4]; + for (i = 0; i < 16; ++i) { + v[((i / 4) + 4 - (i % 4)) % 4][i % 4] = + crypto_aes_sbox[((SIMDVec *)&a)->m128_u8[i]]; + } + for (i = 0; i < 4; ++i) { + t = v[i][0]; + u = v[i][0] ^ v[i][1] ^ v[i][2] ^ v[i][3]; + v[i][0] ^= u ^ XT(v[i][0] ^ v[i][1]); + v[i][1] ^= u ^ XT(v[i][1] ^ v[i][2]); + v[i][2] ^= u ^ XT(v[i][2] ^ v[i][3]); + v[i][3] ^= u ^ XT(v[i][3] ^ t); + } + + for (i = 0; i < 16; ++i) { + ((SIMDVec *)&a)->m128_u8[i] = v[i / 4][i % 4] ^ ((SIMDVec *)&b)->m128_u8[i]; + } + + return a; +} + +#define MULTIPLY(x, y) \ + (((y & 1) * x) ^ ((y >> 1 & 1) * XT(x)) ^ ((y >> 2 & 1) * XT(XT(x))) ^ \ + ((y >> 3 & 1) * XT(XT(XT(x)))) ^ ((y >> 4 & 1) * XT(XT(XT(XT(x)))))) + +inline __m128i aesdec_128_reference(__m128i a, __m128i b) { + uint8_t i, e, f, g, h, v[4][4]; + for (i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = + crypto_aes_rsbox[((SIMDVec *)&a)->m128_u8[i]]; + } + + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^ + MULTIPLY(h, 0x09); + v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^ + MULTIPLY(h, 0x0d); + v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^ + MULTIPLY(h, 0x0b); + v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^ + MULTIPLY(h, 0x0e); + } + + for (i = 0; i < 16; ++i) { + ((SIMDVec *)&a)->m128_u8[i] = v[i / 4][i % 4] ^ ((SIMDVec *)&b)->m128_u8[i]; + } + return a; +} + +inline __m128i aesenclast_128_reference(__m128i s, __m128i rk) { + uint8_t i, v[4][4]; + for (i = 0; i < 16; ++i) + v[((i / 4) + 4 - (i % 4)) % 4][i % 4] = + crypto_aes_sbox[((SIMDVec *)&s)->m128_u8[i]]; + for (i = 0; i < 16; ++i) + ((SIMDVec *)&s)->m128_u8[i] = + v[i / 4][i % 4] ^ ((SIMDVec *)&rk)->m128_u8[i]; + return s; +} + +// Rotates right (circular right shift) value by "amount" positions +static inline uint32_t rotr(uint32_t value, uint32_t amount) { + return (value >> amount) | (value << ((32 - amount) & 31)); +} + +static inline 
uint64_t MUL(uint32_t a, uint32_t b) { + return (uint64_t)a * (uint64_t)b; +} + +// From BearSSL. Performs a 32-bit->64-bit carryless/polynomial +// long multiply. +// +// This implementation was chosen because it is reasonably fast +// without a lookup table or branching. +// +// This does it by splitting up the bits in a way that they +// would not carry, then combine them together with xor (a +// carryless add). +// +// https://www.bearssl.org/gitweb/?p=BearSSL;a=blob;f=src/hash/ghash_ctmul.c;h=3623202;hb=5f045c7#l164 +static uint64_t clmul_32(uint32_t x, uint32_t y) { + uint32_t x0, x1, x2, x3; + uint32_t y0, y1, y2, y3; + uint64_t z0, z1, z2, z3; + + x0 = x & (uint32_t)0x11111111; + x1 = x & (uint32_t)0x22222222; + x2 = x & (uint32_t)0x44444444; + x3 = x & (uint32_t)0x88888888; + y0 = y & (uint32_t)0x11111111; + y1 = y & (uint32_t)0x22222222; + y2 = y & (uint32_t)0x44444444; + y3 = y & (uint32_t)0x88888888; + z0 = MUL(x0, y0) ^ MUL(x1, y3) ^ MUL(x2, y2) ^ MUL(x3, y1); + z1 = MUL(x0, y1) ^ MUL(x1, y0) ^ MUL(x2, y3) ^ MUL(x3, y2); + z2 = MUL(x0, y2) ^ MUL(x1, y1) ^ MUL(x2, y0) ^ MUL(x3, y3); + z3 = MUL(x0, y3) ^ MUL(x1, y2) ^ MUL(x2, y1) ^ MUL(x3, y0); + z0 &= (uint64_t)0x1111111111111111; + z1 &= (uint64_t)0x2222222222222222; + z2 &= (uint64_t)0x4444444444444444; + z3 &= (uint64_t)0x8888888888888888; + return z0 | z1 | z2 | z3; +} + +// Performs a 64x64->128-bit carryless/polynomial long +// multiply, using the above routine to calculate the +// subproducts needed for the full-size multiply. +// +// This uses the Karatsuba algorithm. +// +// Normally, the Karatsuba algorithm isn't beneficial +// until very large numbers due to carry tracking and +// multiplication being relatively cheap. +// +// However, we have no carries and multiplication is +// definitely not cheap, so the Karatsuba algorithm is +// a low cost and easy optimization. +// +// https://en.m.wikipedia.org/wiki/Karatsuba_algorithm +// +// Note that addition and subtraction are both +// performed with xor, since all operations are +// carryless. +// +// The comments represent the actual mathematical +// operations being performed (instead of the bitwise +// operations) and to reflect the linked Wikipedia article. 
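+//
+// As a restatement of the recombination below (with B = 2 and m = 32): the
+// carryless product is
+//   x * y = z2*2^64 ^ z1*2^32 ^ z0,  z1 = clmul_32(x0 ^ x1, y0 ^ y1) ^ z0 ^ z2,
+// so the low 64 bits are z0 ^ (z1 << 32) and the high 64 bits are
+// z2 ^ (z1 >> 32), which is exactly the pair returned.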
+static std::pair<uint64_t, uint64_t> clmul_64(uint64_t x, uint64_t y) {
+  // B = 2
+  // m = 32
+  // x = (x1 * B^m) + x0
+  uint32_t x0 = x & 0xffffffff;
+  uint32_t x1 = x >> 32;
+  // y = (y1 * B^m) + y0
+  uint32_t y0 = y & 0xffffffff;
+  uint32_t y1 = y >> 32;
+
+  // z0 = x0 * y0
+  uint64_t z0 = clmul_32(x0, y0);
+  // z2 = x1 * y1
+  uint64_t z2 = clmul_32(x1, y1);
+  // z1 = (x0 + x1) * (y0 + y1) - z0 - z2
+  uint64_t z1 = clmul_32(x0 ^ x1, y0 ^ y1) ^ z0 ^ z2;
+
+  // xy = z0 + (z1 * B^m) + (z2 * B^2m)
+  // note: z1 is split between the low and high halves
+  uint64_t xy0 = z0 ^ (z1 << 32);
+  uint64_t xy1 = z2 ^ (z1 >> 32);
+
+  return std::make_pair(xy0, xy1);
+}
+
+/* MMX */
+result_t test_mm_empty(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // return TEST_SUCCESS;
+  return TEST_UNIMPL;
+}
+
+/* SSE */
+result_t test_mm_add_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const float *_a = impl.test_cases_float_pointer1;
+  // const float *_b = impl.test_cases_float_pointer2;
+  // float dx = _a[0] + _b[0];
+  // float dy = _a[1] + _b[1];
+  // float dz = _a[2] + _b[2];
+  // float dw = _a[3] + _b[3];
+  //
+  // __m128 a = load_m128(_a);
+  // __m128 b = load_m128(_b);
+  // __m128 c = _mm_add_ps(a, b);
+  // return validate_float(c, dx, dy, dz, dw);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_add_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const float *_a = impl.test_cases_float_pointer1;
+  // const float *_b = impl.test_cases_float_pointer1;
+  //
+  // float f0 = _a[0] + _b[0];
+  // float f1 = _a[1];
+  // float f2 = _a[2];
+  // float f3 = _a[3];
+  //
+  // __m128 a = _mm_load_ps(_a);
+  // __m128 b = _mm_load_ps(_b);
+  // __m128 c = _mm_add_ss(a, b);
+  //
+  // return validate_float(c, f0, f1, f2, f3);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_and_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const float *_a = impl.test_cases_float_pointer1;
+  // const float *_b = impl.test_cases_float_pointer2;
+  // __m128 a = load_m128(_a);
+  // __m128 b = load_m128(_b);
+  // __m128 c = _mm_and_ps(a, b);
+  // now for the assertion...
+  // const uint32_t *ia = (const uint32_t *)&a;
+  // const uint32_t *ib = (const uint32_t *)&b;
+  // uint32_t r[4];
+  // r[0] = ia[0] & ib[0];
+  // r[1] = ia[1] & ib[1];
+  // r[2] = ia[2] & ib[2];
+  // r[3] = ia[3] & ib[3];
+  // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
+  // result_t res = VALIDATE_INT32_M128(*(const __m128i *)&c, r);
+  // if (res) {
+  //   res = VALIDATE_INT32_M128(ret, r);
+  // }
+  // return res;
+  return TEST_UNIMPL;
+}
+
+// r0 := ~a0 & b0
+// r1 := ~a1 & b1
+// r2 := ~a2 & b2
+// r3 := ~a3 & b3
+result_t test_mm_andnot_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const float *_a = impl.test_cases_float_pointer1;
+  // const float *_b = impl.test_cases_float_pointer2;
+  //
+  // __m128 a = load_m128(_a);
+  // __m128 b = load_m128(_b);
+  // __m128 c = _mm_andnot_ps(a, b);
+  // now for the assertion...
+ // const uint32_t *ia = (const uint32_t *)&a; + // const uint32_t *ib = (const uint32_t *)&b; + // uint32_t r[4]; + // r[0] = ~ia[0] & ib[0]; + // r[1] = ~ia[1] & ib[1]; + // r[2] = ~ia[2] & ib[2]; + // r[3] = ~ia[3] & ib[3]; + // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + // result_t res = TEST_FAIL; + // res = VALIDATE_INT32_M128(*(const __m128i *)&c, r); + // if (res) { + // res = VALIDATE_INT32_M128(ret, r); + // } + // return res; + return TEST_UNIMPL; +} + +result_t test_mm_avg_pu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // uint16_t d[4]; + // d[0] = (_a[0] + _b[0] + 1) >> 1; + // d[1] = (_a[1] + _b[1] + 1) >> 1; + // d[2] = (_a[2] + _b[2] + 1) >> 1; + // d[3] = (_a[3] + _b[3] + 1) >> 1; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_avg_pu16(a, b); + // + // return VALIDATE_UINT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_avg_pu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint8_t d[8]; + // d[0] = (_a[0] + _b[0] + 1) >> 1; + // d[1] = (_a[1] + _b[1] + 1) >> 1; + // d[2] = (_a[2] + _b[2] + 1) >> 1; + // d[3] = (_a[3] + _b[3] + 1) >> 1; + // d[4] = (_a[4] + _b[4] + 1) >> 1; + // d[5] = (_a[5] + _b[5] + 1) >> 1; + // d[6] = (_a[6] + _b[6] + 1) >> 1; + // d[7] = (_a[7] + _b[7] + 1) >> 1; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_avg_pu8(a, b); + // + // return VALIDATE_UINT8_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] == _b[0] ? -1 : 0; + // result[1] = _a[1] == _b[1] ? -1 : 0; + // result[2] = _a[2] == _b[2] ? -1 : 0; + // result[3] = _a[3] == _b[3] ? -1 : 0; + // + // __m128 ret = _mm_cmpeq_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] == _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpeq_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpge_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] >= _b[0] ? -1 : 0; + // result[1] = _a[1] >= _b[1] ? -1 : 0; + // result[2] = _a[2] >= _b[2] ? -1 : 0; + // result[3] = _a[3] >= _b[3] ? 
-1 : 0; + // + // __m128 ret = _mm_cmpge_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpge_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] >= _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpge_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] > _b[0] ? -1 : 0; + // result[1] = _a[1] > _b[1] ? -1 : 0; + // result[2] = _a[2] > _b[2] ? -1 : 0; + // result[3] = _a[3] > _b[3] ? -1 : 0; + // + // __m128 ret = _mm_cmpgt_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] > _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpgt_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmple_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] <= _b[0] ? -1 : 0; + // result[1] = _a[1] <= _b[1] ? -1 : 0; + // result[2] = _a[2] <= _b[2] ? -1 : 0; + // result[3] = _a[3] <= _b[3] ? -1 : 0; + // + // __m128 ret = _mm_cmple_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmple_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] <= _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmple_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] < _b[0] ? -1 : 0; + // result[1] = _a[1] < _b[1] ? -1 : 0; + // result[2] = _a[2] < _b[2] ? -1 : 0; + // result[3] = _a[3] < _b[3] ? 
-1 : 0; + // + // __m128 ret = _mm_cmplt_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] < _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmplt_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpneq_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result[4]; + // result[0] = _a[0] != _b[0] ? -1 : 0; + // result[1] = _a[1] != _b[1] ? -1 : 0; + // result[2] = _a[2] != _b[2] ? -1 : 0; + // result[3] = _a[3] != _b[3] ? -1 : 0; + // + // __m128 ret = _mm_cmpneq_ps(a, b); + // __m128i iret = *(const __m128i *)&ret; + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpneq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _a[0] != _b[0] ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpneq_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnge_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] >= _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = !(_a[1] >= _b[1]) ? ALL_BIT_1_32 : 0; + // result[2] = !(_a[2] >= _b[2]) ? ALL_BIT_1_32 : 0; + // result[3] = !(_a[3] >= _b[3]) ? ALL_BIT_1_32 : 0; + // + // __m128 ret = _mm_cmpnge_ps(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnge_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] >= _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpnge_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpngt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] > _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = !(_a[1] > _b[1]) ? ALL_BIT_1_32 : 0; + // result[2] = !(_a[2] > _b[2]) ? ALL_BIT_1_32 : 0; + // result[3] = !(_a[3] > _b[3]) ? 
ALL_BIT_1_32 : 0; + // + // __m128 ret = _mm_cmpngt_ps(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpngt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] > _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpngt_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnle_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] <= _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = !(_a[1] <= _b[1]) ? ALL_BIT_1_32 : 0; + // result[2] = !(_a[2] <= _b[2]) ? ALL_BIT_1_32 : 0; + // result[3] = !(_a[3] <= _b[3]) ? ALL_BIT_1_32 : 0; + // + // __m128 ret = _mm_cmpnle_ps(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnle_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] <= _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpnle_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnlt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] < _b[0]) ? ALL_BIT_1_32 : 0; + // result[1] = !(_a[1] < _b[1]) ? ALL_BIT_1_32 : 0; + // result[2] = !(_a[2] < _b[2]) ? ALL_BIT_1_32 : 0; + // result[3] = !(_a[3] < _b[3]) ? ALL_BIT_1_32 : 0; + // + // __m128 ret = _mm_cmpnlt_ps(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnlt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = !(_a[0] < _b[0]) ? 
ALL_BIT_1_32 : 0; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpnlt_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpord_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // + // for (uint32_t i = 0; i < 4; i++) { + // result[i] = cmp_noNaN(_a[i], _b[i]); + // } + // + // __m128 ret = _mm_cmpord_ps(a, b); + // + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpord_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = cmp_noNaN(_a[0], _b[0]); + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpord_ss(a, b); + // + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpunord_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // + // for (uint32_t i = 0; i < 4; i++) { + // result[i] = cmp_hasNaN(_a[i], _b[i]); + // } + // + // __m128 ret = _mm_cmpunord_ps(a, b); + // + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpunord_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = cmp_hasNaN(_a[0], _b[0]); + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_cmpunord_ss(a, b); + // + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_comieq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comieq_ss correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comieq_ss(_a[0], _b[0]); + // int32_t ret = _mm_comieq_ss(a, b); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comige_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comige_ss(_a[0], _b[0]); + // int32_t ret = _mm_comige_ss(a, b); + // + // return result == ret ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_comigt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comigt_ss(_a[0], _b[0]); + // int32_t ret = _mm_comigt_ss(a, b); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_comile_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comile_ss correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comile_ss(_a[0], _b[0]); + // int32_t ret = _mm_comile_ss(a, b); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comilt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comilt_ss correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comilt_ss(_a[0], _b[0]); + // + // int32_t ret = _mm_comilt_ss(a, b); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comineq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comineq_ss correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // int32_t result = comineq_ss(_a[0], _b[0]); + // int32_t ret = _mm_comineq_ss(a, b); + // + // return result == ret ? 
TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_cvt_pi2ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // + // float dx = (float)_b[0]; + // float dy = (float)_b[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m64 b = load_m64(_b); + // __m128 c = _mm_cvt_pi2ps(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvt_ps2pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // for (int idx = 0; idx < 2; idx++) { + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d[idx] = (int32_t)(bankersRounding(_a[idx])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d[idx] = (int32_t)(floorf(_a[idx])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d[idx] = (int32_t)(ceilf(_a[idx])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d[idx] = (int32_t)(_a[idx]); + // break; + // } + // } + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvt_ps2pi(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvt_si2ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int32_t b = *impl.test_cases_int_pointer2; + // + // float dx = (float)b; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_cvt_si2ss(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvt_ss2si(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d0; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d0 = (int32_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d0 = (int32_t)(floorf(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d0 = (int32_t)(ceilf(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d0 = (int32_t)(_a[0]); + // break; + // } + // + // __m128 a = load_m128(_a); + // int32_t ret = _mm_cvt_ss2si(a); + // return ret == d0 ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi16_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_a[2]; + // float dw = (float)_a[3]; + // + // __m64 a = load_m64(_a); + // __m128 c = _mm_cvtpi16_ps(a); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi32_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // float dx = (float)_b[0]; + // float dy = (float)_b[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m64 b = load_m64(_b); + // __m128 c = _mm_cvtpi32_ps(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi32x2_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_b[0]; + // float dw = (float)_b[1]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m128 c = _mm_cvtpi32x2_ps(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi8_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_a[2]; + // float dw = (float)_a[3]; + // + // __m64 a = load_m64(_a); + // __m128 c = _mm_cvtpi8_ps(a); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int16_t rnd[4]; + // + // for (int i = 0; i < 4; i++) { + // if ((float)INT16_MAX <= _a[i] && _a[i] <= (float)INT32_MAX) { + // rnd[i] = INT16_MAX; + // } else if (INT16_MIN < _a[i] && _a[i] < INT16_MAX) { + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // rnd[i] = (int16_t)bankersRounding(_a[i]); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // rnd[i] = (int16_t)floorf(_a[i]); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // rnd[i] = (int16_t)ceilf(_a[i]); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // rnd[i] = (int16_t)_a[i]; + // break; + // } + // } else { + // rnd[i] = INT16_MIN; + // } + // } + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvtps_pi16(a); + // return VALIDATE_INT16_M64(ret, rnd); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d[0] = (int32_t)bankersRounding(_a[0]); + // d[1] = (int32_t)bankersRounding(_a[1]); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d[0] = (int32_t)floorf(_a[0]); + // d[1] = (int32_t)floorf(_a[1]); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d[0] = (int32_t)ceilf(_a[0]); + // d[1] = (int32_t)ceilf(_a[1]); + // break; + // 
case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // break; + // } + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvtps_pi32(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int8_t rnd[8] = {}; + // + // for (int i = 0; i < 4; i++) { + // if ((float)INT8_MAX <= _a[i] && _a[i] <= (float)INT32_MAX) { + // rnd[i] = INT8_MAX; + // } else if (INT8_MIN < _a[i] && _a[i] < INT8_MAX) { + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // rnd[i] = (int8_t)bankersRounding(_a[i]); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // rnd[i] = (int8_t)floorf(_a[i]); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // rnd[i] = (int8_t)ceilf(_a[i]); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // rnd[i] = (int8_t)_a[i]; + // break; + // } + // } else { + // rnd[i] = INT8_MIN; + // } + // } + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvtps_pi8(a); + // return VALIDATE_INT8_M64(ret, rnd); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpu16_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_a[2]; + // float dw = (float)_a[3]; + // + // __m64 a = load_m64(_a); + // __m128 c = _mm_cvtpu16_ps(a); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpu8_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // + // float dx = (float)_a[0]; + // float dy = (float)_a[1]; + // float dz = (float)_a[2]; + // float dw = (float)_a[3]; + // + // __m64 a = load_m64(_a); + // __m128 c = _mm_cvtpu8_ps(a); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi32_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int32_t b = *impl.test_cases_int_pointer2; + // + // float dx = (float)b; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_cvtsi32_ss(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const int64_t b = *(int64_t *)impl.test_cases_int_pointer2; + // + // float dx = (float)b; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_cvtsi64_ss(a, b); + // + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_cvtss_f32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // float f = _a[0]; + // + // __m128 a = load_m128(_a); + // float c = _mm_cvtss_f32(a); + // + // return f == c ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtss_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // int32_t d0; + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d0 = (int32_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d0 = (int32_t)(floorf(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d0 = (int32_t)(ceilf(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d0 = (int32_t)(_a[0]); + // break; + // } + // + // __m128 a = load_m128(_a); + // int32_t ret = _mm_cvtss_si32(a); + // + // return ret == d0 ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtss_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // int64_t d0; + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d0 = (int64_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d0 = (int64_t)(floorf(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d0 = (int64_t)(ceilf(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d0 = (int64_t)(_a[0]); + // break; + // } + // + // __m128 a = load_m128(_a); + // int64_t ret = _mm_cvtss_si64(a); + // + // return ret == d0 ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtt_ps2pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvtt_ps2pi(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtt_ss2si(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // __m128 a = load_m128(_a); + // int ret = _mm_cvtt_ss2si(a); + // + // return ret == (int32_t)_a[0] ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvttps_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // + // __m128 a = load_m128(_a); + // __m64 ret = _mm_cvttps_pi32(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvttss_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // __m128 a = load_m128(_a); + // int ret = _mm_cvttss_si32(a); + // + // return ret == (int32_t)_a[0] ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvttss_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // __m128 a = load_m128(_a); + // int64_t ret = _mm_cvttss_si64(a); + // + // return ret == (int64_t)_a[0] ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_div_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float f0 = _a[0] / _b[0]; + // float f1 = _a[1] / _b[1]; + // float f2 = _a[2] / _b[2]; + // float f3 = _a[3] / _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_div_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_div_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float d0 = _a[0] / _b[0]; + // float d1 = _a[1]; + // float d2 = _a[2]; + // float d3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_div_ss(a, b); + // + // return validate_float(c, d0, d1, d2, d3); + return TEST_UNIMPL; +} + +result_t test_mm_extract_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME GCC has bug on "_mm_extract_pi16" intrinsics. We will enable this + // test when GCC fix this bug. + // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98495 for more + // information + // #if defined(__clang__) || defined(_MSC_VER) + // uint64_t *_a = (uint64_t *)impl.test_cases_int_pointer1; + // const int idx = iter & 0x3; + // + // __m64 a = load_m64(_a); + // int c; + // switch (idx) { + // case 0: + // c = _mm_extract_pi16(a, 0); + // break; + // case 1: + // c = _mm_extract_pi16(a, 1); + // break; + // case 2: + // c = _mm_extract_pi16(a, 2); + // break; + // case 3: + // c = _mm_extract_pi16(a, 3); + // break; + // } + // + // ASSERT_RETURN((uint64_t)c == ((*_a >> (idx * 16)) & 0xFFFF)); + // ASSERT_RETURN(0 == ((uint64_t)c & 0xFFFF0000)); + // return TEST_SUCCESS; + // #else + // return TEST_UNIMPL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_malloc(const SSE2RVV_TEST_IMPL &impl, uint32_t iter); +result_t test_mm_free(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* We verify _mm_malloc first, and there is no need to check _mm_free . + // */ return test_mm_malloc(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_get_flush_zero_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // int res_flush_zero_on, res_flush_zero_off; + // _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + // res_flush_zero_on = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON; + // _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); + // res_flush_zero_off = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF; + // + // return (res_flush_zero_on && res_flush_zero_off) ? TEST_SUCCESS : + // TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_get_rounding_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // int res_toward_zero, res_to_neg_inf, res_to_pos_inf, res_nearest; + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // res_toward_zero = _MM_GET_ROUNDING_MODE() == _MM_ROUND_TOWARD_ZERO ? 1 : + // 0; _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); res_to_neg_inf = + // _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN ? 1 : 0; + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // res_to_pos_inf = _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP ? 1 : 0; + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // res_nearest = _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST ? 
1 : 0; + // + // if (res_toward_zero && res_to_neg_inf && res_to_pos_inf && res_nearest) { + // return TEST_SUCCESS; + // } else { + // return TEST_FAIL; + // } + return TEST_UNIMPL; +} + +result_t test_mm_getcsr(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // store original csr value for post test restoring + // unsigned int originalCsr = _mm_getcsr(); + // + // unsigned int roundings[] = {_MM_ROUND_TOWARD_ZERO, _MM_ROUND_DOWN, + // _MM_ROUND_UP, _MM_ROUND_NEAREST}; + // for (size_t i = 0; i < sizeof(roundings) / sizeof(roundings[0]); i++) { + // _mm_setcsr(_mm_getcsr() | roundings[i]); + // if ((_mm_getcsr() & roundings[i]) != roundings[i]) { + // return TEST_FAIL; + // } + // } + // + // restore original csr value for remaining tests + // _mm_setcsr(originalCsr); + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_insert_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t insert = (int16_t)impl.test_cases_ints[iter]; + // __m64 a; + // __m64 b; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[4]; + // for (int i = 0; i < 4; i++) { + // d##IDX[i] = _a[i]; + // } + // d##IDX[IDX] = insert; + // + // a = load_m64(_a); + // b = _mm_insert_pi16(a, insert, IDX); + // CHECK_RESULT(VALIDATE_INT16_M64(b, d##IDX)) + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_load_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_load_ps(addr); + // + // return validate_float(ret, addr[0], addr[1], addr[2], addr[3]); + return TEST_UNIMPL; +} + +result_t test_mm_load_ps1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_load_ps1(addr); + // + // return validate_float(ret, addr[0], addr[0], addr[0], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_load_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_load_ss(addr); + // + // return validate_float(ret, addr[0], 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_load1_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // __m128 a = _mm_load1_ps(p); + // return validate_float(a, p[0], p[0], p[0], p[0]); + return TEST_UNIMPL; +} + +result_t test_mm_loadh_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p1 = impl.test_cases_float_pointer1; + // const float *p2 = impl.test_cases_float_pointer2; + // const __m64 *b = (const __m64 *)p2; + // __m128 a = _mm_load_ps(p1); + // __m128 c = _mm_loadh_pi(a, b); + // + // return validate_float(c, p1[0], p1[1], p2[0], p2[1]); + return TEST_UNIMPL; +} + +result_t test_mm_loadl_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p1 = impl.test_cases_float_pointer1; + // const float *p2 = impl.test_cases_float_pointer2; + // __m128 a = _mm_load_ps(p1); + // const __m64 *b = (const __m64 *)p2; + // __m128 c = _mm_loadl_pi(a, b); + // + // return validate_float(c, p2[0], p2[1], p1[2], p1[3]); + return TEST_UNIMPL; +} + +result_t test_mm_loadr_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_loadr_ps(addr); + // + // return validate_float(ret, addr[3], addr[2], addr[1], addr[0]); + return TEST_UNIMPL; +} + 
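All of the placeholder cases above share the same shape: compute the expected lanes with plain scalar C from the `impl.test_cases_*` buffers, run the intrinsic, and compare the result with one of the `validate_*` helpers; `return TEST_UNIMPL` simply keeps the case reported as unimplemented until the corresponding `sse2rvv` intrinsic is ready. As a minimal sketch, assuming the `load_m128`/`validate_float` helpers behave the way the commented bodies suggest, enabling `test_mm_load_ps` would look like this:

```cpp
result_t test_mm_load_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
  const float *addr = impl.test_cases_float_pointer1;

  // Run the intrinsic under test on the shared input buffer ...
  __m128 ret = _mm_load_ps(addr);

  // ... and check every lane against the scalar reference values.
  return validate_float(ret, addr[0], addr[1], addr[2], addr[3]);
}
```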
+result_t test_mm_loadu_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *addr = impl.test_cases_float_pointer1; + // + // __m128 ret = _mm_loadu_ps(addr); + // + // return validate_float(ret, addr[0], addr[1], addr[2], addr[3]); + return TEST_UNIMPL; +} + +result_t test_mm_loadu_si16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // The GCC version before 11 does not implement intrinsic function + // _mm_loadu_si16. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) + // return TEST_UNIMPL; + // #else + // const int16_t *addr = (const int16_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_loadu_si16((const void *)addr); + // + // return validate_int16(ret, addr[0], 0, 0, 0, 0, 0, 0, 0); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_loadu_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // Versions of GCC prior to 9 do not implement intrinsic function + // _mm_loadu_si64. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9) + // return TEST_UNIMPL; + // #else + // const int64_t *addr = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_loadu_si64((const void *)addr); + // + // return validate_int64(ret, addr[0], 0); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_malloc(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const size_t *a = (const size_t *)impl.test_cases_int_pointer1; + // const size_t *b = (const size_t *)impl.test_cases_int_pointer2; + // size_t size = *a % (1024 * 16) + 1; + // size_t align = 2 << (*b % 5); + // + // void *p = _mm_malloc(size, align); + // if (!p) + // return TEST_FAIL; + // result_t res = (((uintptr_t)p % align) == 0) ? TEST_SUCCESS : TEST_FAIL; + // _mm_free(p); + // return res; + return TEST_UNIMPL; +} + +result_t test_mm_maskmove_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_mask = (const uint8_t *)impl.test_cases_int_pointer2; + // char mem_addr[16]; + // + // const __m64 *a = (const __m64 *)_a; + // const __m64 *mask = (const __m64 *)_mask; + // _mm_maskmove_si64(*a, *mask, (char *)mem_addr); + // + // for (int i = 0; i < 8; i++) { + // if (_mask[i] >> 7) { + // ASSERT_RETURN(_a[i] == (uint8_t)mem_addr[i]); + // } + // } + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_m_maskmovq(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_maskmove_si64(impl, iter); +} + +result_t test_mm_max_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t c[4]; + // + // c[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_max_pi16(a, b); + // return VALIDATE_INT16_M64(ret, c); + return TEST_UNIMPL; +} + +result_t test_mm_max_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float c[4]; + // + // c[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] > _b[1] ? 
_a[1] : _b[1]; + // c[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_max_ps(a, b); + // return validate_float(ret, c[0], c[1], c[2], c[3]); + return TEST_UNIMPL; +} + +result_t test_mm_max_pu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint8_t c[8]; + // + // c[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // c[4] = _a[4] > _b[4] ? _a[4] : _b[4]; + // c[5] = _a[5] > _b[5] ? _a[5] : _b[5]; + // c[6] = _a[6] > _b[6] ? _a[6] : _b[6]; + // c[7] = _a[7] > _b[7] ? _a[7] : _b[7]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_max_pu8(a, b); + // return VALIDATE_UINT8_M64(ret, c); + return TEST_UNIMPL; +} + +result_t test_mm_max_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer1; + // + // float f0 = _a[0] > _b[0] ? _a[0] : _b[0]; + // float f1 = _a[1]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = _mm_load_ps(_a); + // __m128 b = _mm_load_ps(_b); + // __m128 c = _mm_max_ss(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_min_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t c[4]; + // + // c[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_min_pi16(a, b); + // return VALIDATE_INT16_M64(ret, c); + return TEST_UNIMPL; +} + +result_t test_mm_min_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float c[4]; + // + // c[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_min_ps(a, b); + // return validate_float(ret, c[0], c[1], c[2], c[3]); + return TEST_UNIMPL; +} + +result_t test_mm_min_pu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint8_t c[8]; + // + // c[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // c[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // c[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // c[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // c[4] = _a[4] < _b[4] ? _a[4] : _b[4]; + // c[5] = _a[5] < _b[5] ? _a[5] : _b[5]; + // c[6] = _a[6] < _b[6] ? _a[6] : _b[6]; + // c[7] = _a[7] < _b[7] ? 
_a[7] : _b[7]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_min_pu8(a, b); + // return VALIDATE_UINT8_M64(ret, c); + return TEST_UNIMPL; +} + +result_t test_mm_min_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float c; + // + // c = _a[0] < _b[0] ? _a[0] : _b[0]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_min_ss(a, b); + // + // return validate_float(ret, c, _a[1], _a[2], _a[3]); + return TEST_UNIMPL; +} + +result_t test_mm_move_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // + // float result[4]; + // result[0] = _b[0]; + // result[1] = _a[1]; + // result[2] = _a[2]; + // result[3] = _a[3]; + // + // __m128 ret = _mm_move_ss(a, b); + // return validate_float(ret, result[0], result[1], result[2], result[3]); + return TEST_UNIMPL; +} + +result_t test_mm_movehl_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _b[2]; + // float f1 = _b[3]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_movehl_ps(a, b); + // + // return validate_float(ret, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_movelh_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _a[0]; + // float f1 = _a[1]; + // float f2 = _b[0]; + // float f3 = _b[1]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 ret = _mm_movelh_ps(a, b); + // + // return validate_float(ret, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_movemask_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // unsigned int _c = 0; + // for (int i = 0; i < 8; i++) { + // if (_a[i] & 0x80) { + // _c |= (1 << i); + // } + // } + // + // const __m64 *a = (const __m64 *)_a; + // int c = _mm_movemask_pi8(*a); + // + // ASSERT_RETURN((unsigned int)c == _c); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_movemask_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // int ret = 0; + // + // const uint32_t *ip = (const uint32_t *)p; + // if (ip[0] & 0x80000000) { + // ret |= 1; + // } + // if (ip[1] & 0x80000000) { + // ret |= 2; + // } + // if (ip[2] & 0x80000000) { + // ret |= 4; + // } + // if (ip[3] & 0x80000000) { + // ret |= 8; + // } + // __m128 a = load_m128(p); + // int val = _mm_movemask_ps(a); + // return val == ret ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_mul_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float dx = _a[0] * _b[0]; + // float dy = _a[1] * _b[1]; + // float dz = _a[2] * _b[2]; + // float dw = _a[3] * _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_mul_ps(a, b); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_mul_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float dx = _a[0] * _b[0]; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_mul_ss(a, b); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_mulhi_pu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // uint16_t d[4]; + // for (uint32_t i = 0; i < 4; i++) { + // uint32_t m = (uint32_t)_a[i] * (uint32_t)_b[i]; + // d[i] = (uint16_t)(m >> 16); + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_mulhi_pu16(a, b); + // return VALIDATE_UINT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_or_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_or_ps(a, b); + // now for the assertion... 
+ // const uint32_t *ia = (const uint32_t *)&a; + // const uint32_t *ib = (const uint32_t *)&b; + // uint32_t r[4]; + // r[0] = ia[0] | ib[0]; + // r[1] = ia[1] | ib[1]; + // r[2] = ia[2] | ib[2]; + // r[3] = ia[3] | ib[3]; + // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + // result_t res = VALIDATE_INT32_M128(*(const __m128i *)&c, r); + // if (res) { + // res = VALIDATE_INT32_M128(ret, r); + // } + // + // return res; + return TEST_UNIMPL; +} + +result_t test_m_pavgb(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_avg_pu8(impl, iter); +} + +result_t test_m_pavgw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_avg_pu16(impl, iter); +} + +result_t test_m_pextrw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_extract_pi16(impl, iter); +} + +result_t test_m_pinsrw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_insert_pi16(impl, iter); +} + +result_t test_m_pmaxsw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_max_pi16(impl, iter); +} + +result_t test_m_pmaxub(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_max_pu8(impl, iter); +} + +result_t test_m_pminsw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_min_pi16(impl, iter); +} + +result_t test_m_pminub(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_min_pu8(impl, iter); +} + +result_t test_m_pmovmskb(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_movemask_pi8(impl, iter); +} + +result_t test_m_pmulhuw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return test_mm_mulhi_pu16(impl, iter); +} + +result_t test_mm_prefetch(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // typedef struct { + // __m128 a; + // float r[4]; + // } prefetch_test_t; + // prefetch_test_t test_vec[8] = { + // { + // _mm_set_ps(-0.1f, 0.2f, 0.3f, 0.4f), + // {0.4f, 0.3f, 0.2f, -0.1f}, + // }, + // { + // _mm_set_ps(0.5f, 0.6f, -0.7f, -0.8f), + // {-0.8f, -0.7f, 0.6f, 0.5f}, + // }, + // { + // _mm_set_ps(0.9f, 0.10f, -0.11f, 0.12f), + // {0.12f, -0.11f, 0.10f, 0.9f}, + // }, + // { + // _mm_set_ps(-1.1f, -2.1f, -3.1f, -4.1f), + // {-4.1f, -3.1f, -2.1f, -1.1f}, + // }, + // { + // _mm_set_ps(100.0f, -110.0f, 120.0f, -130.0f), + // {-130.0f, 120.0f, -110.0f, 100.0f}, + // }, + // { + // _mm_set_ps(200.5f, 210.5f, -220.5f, 230.5f), + // {995.74f, -93.04f, 144.03f, 902.50f}, + // }, + // { + // _mm_set_ps(10.11f, -11.12f, -12.13f, 13.14f), + // {13.14f, -12.13f, -11.12f, 10.11f}, + // }, + // { + // _mm_set_ps(10.1f, -20.2f, 30.3f, 40.4f), + // {40.4f, 30.3f, -20.2f, 10.1f}, + // }, + // }; + // + // for (size_t i = 0; i < (sizeof(test_vec) / (sizeof(test_vec[0]))); i++) { + // _mm_prefetch(((const char *)&test_vec[i].a), _MM_HINT_T0); + // _mm_prefetch(((const char *)&test_vec[i].a), _MM_HINT_T1); + // _mm_prefetch(((const char *)&test_vec[i].a), _MM_HINT_T2); + // _mm_prefetch(((const char *)&test_vec[i].a), _MM_HINT_NTA); + // } + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_m_psadbw(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint16_t d = 0; + // for (int i = 0; i < 8; i++) { + // d += abs(_a[i] - _b[i]); + // } + + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _m_psadbw(a, b); + // return validate_uint16(c, d, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_m_pshufw(const SSE2RVV_TEST_IMPL &impl, 
uint32_t iter) { + // return test_mm_shuffle_pi16(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_rcp_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // float dx = 1.0f / _a[0]; + // float dy = 1.0f / _a[1]; + // float dz = 1.0f / _a[2]; + // float dw = 1.0f / _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_rcp_ps(a); + // return validate_float_error(c, dx, dy, dz, dw, 0.001f); + return TEST_UNIMPL; +} + +result_t test_mm_rcp_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // float dx = 1.0f / _a[0]; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // __m128 a = load_m128(_a); + // __m128 c = _mm_rcp_ss(a); + // return validate_float_error(c, dx, dy, dz, dw, 0.001f); + return TEST_UNIMPL; +} + +result_t test_mm_rsqrt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // float f0 = 1 / sqrt(_a[0]); + // float f1 = 1 / sqrt(_a[1]); + // float f2 = 1 / sqrt(_a[2]); + // float f3 = 1 / sqrt(_a[3]); + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_rsqrt_ps(a); + // + // Here, we ensure the error rate of "_mm_rsqrt_ps()" is under 0.1% compared + // to the C implementation. + // return validate_float_error(c, f0, f1, f2, f3, 0.001f); + return TEST_UNIMPL; +} + +result_t test_mm_rsqrt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // float f0 = 1 / sqrt(_a[0]); + // float f1 = _a[1]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_rsqrt_ss(a); + // + // Here, we ensure the error rate of "_mm_rsqrt_ps()" is under 0.1% compared + // to the C implementation. + // return validate_float_error(c, f0, f1, f2, f3, 0.001f); + return TEST_UNIMPL; +} + +result_t test_mm_sad_pu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // uint16_t d = 0; + // for (int i = 0; i < 8; i++) { + // d += abs(_a[i] - _b[i]); + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sad_pu8(a, b); + // return validate_uint16(c, d, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_set_flush_zero_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // TODO: + // After the behavior of denormal number and flush zero mode is fully + // investigated, the testing would be added. 
+ // return TEST_UNIMPL; + return TEST_UNIMPL; +} + +result_t test_mm_set_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float x = impl.test_cases_floats[iter]; + // float y = impl.test_cases_floats[iter + 1]; + // float z = impl.test_cases_floats[iter + 2]; + // float w = impl.test_cases_floats[iter + 3]; + // __m128 a = _mm_set_ps(x, y, z, w); + // return validate_float(a, w, z, y, x); + return TEST_UNIMPL; +} + +result_t test_mm_set_ps1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float a = impl.test_cases_floats[iter]; + // + // __m128 ret = _mm_set_ps1(a); + // + // return validate_float(ret, a, a, a, a); + return TEST_UNIMPL; +} + +result_t test_mm_set_rounding_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // result_t res_toward_zero, res_to_neg_inf, res_to_pos_inf, res_nearest; + // + // __m128 a = load_m128(_a); + // __m128 b, c; + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // b = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // c = _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // res_toward_zero = validate_128bits(c, b); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // b = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // c = _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // res_to_neg_inf = validate_128bits(c, b); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // b = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // c = _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // res_to_pos_inf = validate_128bits(c, b); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // b = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // c = _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + // res_nearest = validate_128bits(c, b); + // + // if (res_toward_zero == TEST_SUCCESS && res_to_neg_inf == TEST_SUCCESS && + // res_to_pos_inf == TEST_SUCCESS && res_nearest == TEST_SUCCESS) { + // return TEST_SUCCESS; + // } else { + // return TEST_FAIL; + // } + return TEST_UNIMPL; +} + +result_t test_mm_set_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float a = impl.test_cases_floats[iter]; + // __m128 c = _mm_set_ss(a); + // return validate_float(c, a, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_set1_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float w = impl.test_cases_floats[iter]; + // __m128 a = _mm_set1_ps(w); + // return validate_float(a, w, w, w, w); + return TEST_UNIMPL; +} + +result_t test_mm_setcsr(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_set_rounding_mode(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_setr_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float x = impl.test_cases_floats[iter]; + // float y = impl.test_cases_floats[iter + 1]; + // float z = impl.test_cases_floats[iter + 2]; + // float w = impl.test_cases_floats[iter + 3]; + // + // __m128 ret = _mm_setr_ps(w, z, y, x); + // + // return validate_float(ret, w, z, y, x); + return TEST_UNIMPL; +} + +result_t test_mm_setzero_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128 a = _mm_setzero_ps(); + // return validate_float(a, 0, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_sfence(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* FIXME: Assume that memory barriers always function as intended. 
*/ + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m64 a; + // __m64 d; + // + // #define TEST_IMPL(IDX) + // a = load_m64(_a); + // d = _mm_shuffle_pi16(a, IDX); + // + // int16_t _d##IDX[4]; + // _d##IDX[0] = _a[IDX & 0x3]; + // _d##IDX[1] = _a[(IDX >> 2) & 0x3]; + // _d##IDX[2] = _a[(IDX >> 4) & 0x3]; + // _d##IDX[3] = _a[(IDX >> 6) & 0x3]; + // if (VALIDATE_INT16_M64(d, _d##IDX) != TEST_SUCCESS) { + // return TEST_FAIL; + // } + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +// Note, NEON does not have a general purpose shuffled command like SSE. +// When invoking this method, there is special code for a number of the most +// common shuffle permutations +result_t test_mm_shuffle_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // result_t isValid = TEST_SUCCESS; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // Test many permutations of the shuffle operation, including all + // permutations which have an optimized/customized implementation + // __m128 ret; + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(0, 1, 2, 3)); + // if (!validate_float(ret, _a[3], _a[2], _b[1], _b[0])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)); + // if (!validate_float(ret, _a[0], _a[1], _b[2], _b[3])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(0, 0, 1, 1)); + // if (!validate_float(ret, _a[1], _a[1], _b[0], _b[0])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 0, 2)); + // if (!validate_float(ret, _a[2], _a[0], _b[1], _b[3])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)); + // if (!validate_float(ret, _a[2], _a[3], _b[0], _b[1])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 3, 0, 1)); + // if (!validate_float(ret, _a[1], _a[0], _b[3], _b[2])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(0, 0, 2, 2)); + // if (!validate_float(ret, _a[2], _a[2], _b[0], _b[0])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 2, 0, 0)); + // if (!validate_float(ret, _a[0], _a[0], _b[2], _b[2])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 0, 2)); + // if (!validate_float(ret, _a[2], _a[0], _b[2], _b[3])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 1, 3, 3)); + // if (!validate_float(ret, _a[3], _a[3], _b[1], _b[1])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 1, 0)); + // if (!validate_float(ret, _a[0], _a[1], _b[0], _b[2])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 0, 1)); + // if (!validate_float(ret, _a[1], _a[0], _b[0], _b[2])) { + // isValid = TEST_FAIL; + // } + // ret = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 3, 2)); + // if (!validate_float(ret, _a[2], _a[3], _b[0], _b[2])) { + // isValid = TEST_FAIL; + // } + // + // return isValid; + return TEST_UNIMPL; +} + +result_t test_mm_sqrt_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // float f0 = sqrt(_a[0]); + // float f1 = 
sqrt(_a[1]); + // float f2 = sqrt(_a[2]); + // float f3 = sqrt(_a[3]); + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_sqrt_ps(a); + // + // return validate_float_error(c, f0, f1, f2, f3, 0.000001f); + return TEST_UNIMPL; +} + +result_t test_mm_sqrt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // float f0 = sqrt(_a[0]); + // float f1 = _a[1]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_sqrt_ss(a); + // + // return validate_float_error(c, f0, f1, f2, f3, 0.000001f); + return TEST_UNIMPL; +} + +result_t test_mm_store_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t *p = impl.test_cases_int_pointer1; + // int32_t x = impl.test_cases_ints[iter]; + // int32_t y = impl.test_cases_ints[iter + 1]; + // int32_t z = impl.test_cases_ints[iter + 2]; + // int32_t w = impl.test_cases_ints[iter + 3]; + // __m128i a = _mm_set_epi32(x, y, z, w); + // _mm_store_ps((float *)p, *(const __m128 *)&a); + // ASSERT_RETURN(p[0] == w); + // ASSERT_RETURN(p[1] == z); + // ASSERT_RETURN(p[2] == y); + // ASSERT_RETURN(p[3] == x); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_ps1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *p = impl.test_cases_float_pointer1; + // float d[4]; + // + // __m128 a = load_m128(p); + // _mm_store_ps1(d, a); + // + // ASSERT_RETURN(d[0] == *p); + // ASSERT_RETURN(d[1] == *p); + // ASSERT_RETURN(d[2] == *p); + // ASSERT_RETURN(d[3] == *p); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float x = impl.test_cases_floats[iter]; + // float p[4]; + // + // __m128 a = _mm_set_ss(x); + // _mm_store_ss(p, a); + // ASSERT_RETURN(p[0] == x); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store1_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *p = impl.test_cases_float_pointer1; + // float d[4]; + // + // __m128 a = load_m128(p); + // _mm_store1_ps(d, a); + // + // ASSERT_RETURN(d[0] == *p); + // ASSERT_RETURN(d[1] == *p); + // ASSERT_RETURN(d[2] == *p); + // ASSERT_RETURN(d[3] == *p); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storeh_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // float d[4] = {1.0f, 2.0f, 3.0f, 4.0f}; + // __m128 a = _mm_load_ps(p); + // __m64 *b = (__m64 *)d; + // + // _mm_storeh_pi(b, a); + // ASSERT_RETURN(d[0] == p[2]); + // ASSERT_RETURN(d[1] == p[3]); + // ASSERT_RETURN(d[2] == 3.0f); + // ASSERT_RETURN(d[3] == 4.0f); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storel_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // float d[4] = {1.0f, 2.0f, 3.0f, 4.0f}; + // __m128 a = _mm_load_ps(p); + // __m64 *b = (__m64 *)d; + // + // _mm_storel_pi(b, a); + // ASSERT_RETURN(d[0] == p[0]); + // ASSERT_RETURN(d[1] == p[1]); + // ASSERT_RETURN(d[2] == 3.0f); + // ASSERT_RETURN(d[3] == 4.0f); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storer_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *p = impl.test_cases_float_pointer1; + // float d[4]; + // + // __m128 a = load_m128(p); + // _mm_storer_ps(d, a); + // + // ASSERT_RETURN(d[0] == p[3]); + // ASSERT_RETURN(d[1] == p[2]); + // ASSERT_RETURN(d[2] == p[1]); + // ASSERT_RETURN(d[3] == p[0]); + // return 
TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storeu_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *_a = impl.test_cases_float_pointer1; + // float f[4]; + // __m128 a = _mm_load_ps(_a); + // + // _mm_storeu_ps(f, a); + // return validate_float(a, f[0], f[1], f[2], f[3]); + return TEST_UNIMPL; +} + +result_t test_mm_storeu_si16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // The GCC version before 11 does not implement intrinsic function + // _mm_storeu_si16. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) + // return TEST_UNIMPL; + // #else + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i b; + // __m128i a = load_m128i(_a); + // _mm_storeu_si16(&b, a); + // int16_t *_b = (int16_t *)&b; + // int16_t *_c = (int16_t *)&a; + // return validate_int16(b, _c[0], _b[1], _b[2], _b[3], _b[4], _b[5], _b[6], + // _b[7]); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_storeu_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // Versions of GCC prior to 9 do not implement intrinsic function + // _mm_storeu_si64. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87558 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 9) + // return TEST_UNIMPL; + // #else + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i b; + // __m128i a = load_m128i(_a); + // _mm_storeu_si64(&b, a); + // int64_t *_b = (int64_t *)&b; + // int64_t *_c = (int64_t *)&a; + // return validate_int64(b, _c[0], _b[1]); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_stream_pi(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // __m64 p; + // + // _mm_stream_pi(&p, a); + // return validate_int64(p, _a[0]); + return TEST_UNIMPL; +} + +result_t test_mm_stream_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // __m128 a = load_m128(_a); + // alignas(16) float p[4]; + // + // _mm_stream_ps(p, a); + // ASSERT_RETURN(p[0] == _a[0]); + // ASSERT_RETURN(p[1] == _a[1]); + // ASSERT_RETURN(p[2] == _a[2]); + // ASSERT_RETURN(p[3] == _a[3]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sub_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float dx = _a[0] - _b[0]; + // float dy = _a[1] - _b[1]; + // float dz = _a[2] - _b[2]; + // float dw = _a[3] - _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_sub_ps(a, b); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_sub_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float dx = _a[0] - _b[0]; + // float dy = _a[1]; + // float dz = _a[2]; + // float dw = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_sub_ss(a, b); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_ucomieq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomieq_ss is equal to _mm_comieq_ss + // return test_mm_comieq_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t 
test_mm_ucomige_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomige_ss is equal to _mm_comige_ss + // return test_mm_comige_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomigt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomigt_ss is equal to _mm_comigt_ss + // return test_mm_comigt_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomile_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomile_ss is equal to _mm_comile_ss + // return test_mm_comile_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomilt_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomilt_ss is equal to _mm_comilt_ss + // return test_mm_comilt_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomineq_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _mm_ucomineq_ss is equal to _mm_comineq_ss + // return test_mm_comineq_ss(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_undefined_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128 a = _mm_undefined_ps(); + // a = _mm_xor_ps(a, a); + // return validate_float(a, 0, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *_a = impl.test_cases_float_pointer1; + // float *_b = impl.test_cases_float_pointer1; + // + // float f0 = _a[2]; + // float f1 = _b[2]; + // float f2 = _a[3]; + // float f3 = _b[3]; + // + // __m128 a = _mm_load_ps(_a); + // __m128 b = _mm_load_ps(_b); + // __m128 c = _mm_unpackhi_ps(a, b); + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // float *_a = impl.test_cases_float_pointer1; + // float *_b = impl.test_cases_float_pointer1; + // + // float f0 = _a[0]; + // float f1 = _b[0]; + // float f2 = _a[1]; + // float f3 = _b[1]; + // + // __m128 a = _mm_load_ps(_a); + // __m128 b = _mm_load_ps(_b); + // __m128 c = _mm_unpacklo_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_xor_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_float_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_float_pointer2; + // + // int32_t d0 = _a[0] ^ _b[0]; + // int32_t d1 = _a[1] ^ _b[1]; + // int32_t d2 = _a[2] ^ _b[2]; + // int32_t d3 = _a[3] ^ _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_xor_ps(a, b); + // + // return validate_float(c, *((float *)&d0), *((float *)&d1), *((float + // *)&d2), + // *((float *)&d3)); + return TEST_UNIMPL; +} + +/* SSE2 */ +result_t test_mm_add_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[8]; + // d[0] = _a[0] + _b[0]; + // d[1] = _a[1] + _b[1]; + // d[2] = _a[2] + _b[2]; + // d[3] = _a[3] + _b[3]; + // d[4] = _a[4] + _b[4]; + // d[5] = _a[5] + _b[5]; + // d[6] = _a[6] + _b[6]; + // d[7] = _a[7] + _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_add_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_add_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // int32_t d[4]; + // 
d[0] = _a[0] + _b[0]; + // d[1] = _a[1] + _b[1]; + // d[2] = _a[2] + _b[2]; + // d[3] = _a[3] + _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_add_epi32(a, b); + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_add_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t d0 = _a[0] + _b[0]; + // int64_t d1 = _a[1] + _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_add_epi64(a, b); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_add_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = _a[0] + _b[0]; + // d[1] = _a[1] + _b[1]; + // d[2] = _a[2] + _b[2]; + // d[3] = _a[3] + _b[3]; + // d[4] = _a[4] + _b[4]; + // d[5] = _a[5] + _b[5]; + // d[6] = _a[6] + _b[6]; + // d[7] = _a[7] + _b[7]; + // d[8] = _a[8] + _b[8]; + // d[9] = _a[9] + _b[9]; + // d[10] = _a[10] + _b[10]; + // d[11] = _a[11] + _b[11]; + // d[12] = _a[12] + _b[12]; + // d[13] = _a[13] + _b[13]; + // d[14] = _a[14] + _b[14]; + // d[15] = _a[15] + _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_add_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_add_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] + _b[0]; + // double d1 = _a[1] + _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_add_pd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_add_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] + _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_add_sd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_add_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t d0 = _a[0] + _b[0]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_add_si64(a, b); + // + // return validate_int64(c, d0); + return TEST_UNIMPL; +} + +result_t test_mm_adds_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int32_t d[8]; + // d[0] = (int32_t)_a[0] + (int32_t)_b[0]; + // if (d[0] > 32767) + // d[0] = 32767; + // if (d[0] < -32768) + // d[0] = -32768; + // d[1] = (int32_t)_a[1] + (int32_t)_b[1]; + // if (d[1] > 32767) + // d[1] = 32767; + // if (d[1] < -32768) + // d[1] = -32768; + // d[2] = (int32_t)_a[2] + (int32_t)_b[2]; + // if (d[2] > 32767) + // d[2] = 32767; + // if (d[2] < -32768) + // d[2] = 
-32768; + // d[3] = (int32_t)_a[3] + (int32_t)_b[3]; + // if (d[3] > 32767) + // d[3] = 32767; + // if (d[3] < -32768) + // d[3] = -32768; + // d[4] = (int32_t)_a[4] + (int32_t)_b[4]; + // if (d[4] > 32767) + // d[4] = 32767; + // if (d[4] < -32768) + // d[4] = -32768; + // d[5] = (int32_t)_a[5] + (int32_t)_b[5]; + // if (d[5] > 32767) + // d[5] = 32767; + // if (d[5] < -32768) + // d[5] = -32768; + // d[6] = (int32_t)_a[6] + (int32_t)_b[6]; + // if (d[6] > 32767) + // d[6] = 32767; + // if (d[6] < -32768) + // d[6] = -32768; + // d[7] = (int32_t)_a[7] + (int32_t)_b[7]; + // if (d[7] > 32767) + // d[7] = 32767; + // if (d[7] < -32768) + // d[7] = -32768; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_adds_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_adds_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int16_t d[16]; + // for (int i = 0; i < 16; i++) { + // d[i] = (int16_t)_a[i] + (int16_t)_b[i]; + // if (d[i] > 127) + // d[i] = 127; + // if (d[i] < -128) + // d[i] = -128; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_adds_epi8(a, b); + // + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_adds_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint32_t max = 0xFFFF; + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // + // uint16_t d[8]; + // d[0] = (uint32_t)_a[0] + (uint32_t)_b[0] > max ? max : _a[0] + _b[0]; + // d[1] = (uint32_t)_a[1] + (uint32_t)_b[1] > max ? max : _a[1] + _b[1]; + // d[2] = (uint32_t)_a[2] + (uint32_t)_b[2] > max ? max : _a[2] + _b[2]; + // d[3] = (uint32_t)_a[3] + (uint32_t)_b[3] > max ? max : _a[3] + _b[3]; + // d[4] = (uint32_t)_a[4] + (uint32_t)_b[4] > max ? max : _a[4] + _b[4]; + // d[5] = (uint32_t)_a[5] + (uint32_t)_b[5] > max ? max : _a[5] + _b[5]; + // d[6] = (uint32_t)_a[6] + (uint32_t)_b[6] > max ? max : _a[6] + _b[6]; + // d[7] = (uint32_t)_a[7] + (uint32_t)_b[7] > max ? 
max : _a[7] + _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_adds_epu16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_adds_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = (uint8_t)_a[0] + (uint8_t)_b[0]; + // if (d[0] < (uint8_t)_a[0]) + // d[0] = 255; + // d[1] = (uint8_t)_a[1] + (uint8_t)_b[1]; + // if (d[1] < (uint8_t)_a[1]) + // d[1] = 255; + // d[2] = (uint8_t)_a[2] + (uint8_t)_b[2]; + // if (d[2] < (uint8_t)_a[2]) + // d[2] = 255; + // d[3] = (uint8_t)_a[3] + (uint8_t)_b[3]; + // if (d[3] < (uint8_t)_a[3]) + // d[3] = 255; + // d[4] = (uint8_t)_a[4] + (uint8_t)_b[4]; + // if (d[4] < (uint8_t)_a[4]) + // d[4] = 255; + // d[5] = (uint8_t)_a[5] + (uint8_t)_b[5]; + // if (d[5] < (uint8_t)_a[5]) + // d[5] = 255; + // d[6] = (uint8_t)_a[6] + (uint8_t)_b[6]; + // if (d[6] < (uint8_t)_a[6]) + // d[6] = 255; + // d[7] = (uint8_t)_a[7] + (uint8_t)_b[7]; + // if (d[7] < (uint8_t)_a[7]) + // d[7] = 255; + // d[8] = (uint8_t)_a[8] + (uint8_t)_b[8]; + // if (d[8] < (uint8_t)_a[8]) + // d[8] = 255; + // d[9] = (uint8_t)_a[9] + (uint8_t)_b[9]; + // if (d[9] < (uint8_t)_a[9]) + // d[9] = 255; + // d[10] = (uint8_t)_a[10] + (uint8_t)_b[10]; + // if (d[10] < (uint8_t)_a[10]) + // d[10] = 255; + // d[11] = (uint8_t)_a[11] + (uint8_t)_b[11]; + // if (d[11] < (uint8_t)_a[11]) + // d[11] = 255; + // d[12] = (uint8_t)_a[12] + (uint8_t)_b[12]; + // if (d[12] < (uint8_t)_a[12]) + // d[12] = 255; + // d[13] = (uint8_t)_a[13] + (uint8_t)_b[13]; + // if (d[13] < (uint8_t)_a[13]) + // d[13] = 255; + // d[14] = (uint8_t)_a[14] + (uint8_t)_b[14]; + // if (d[14] < (uint8_t)_a[14]) + // d[14] = 255; + // d[15] = (uint8_t)_a[15] + (uint8_t)_b[15]; + // if (d[15] < (uint8_t)_a[15]) + // d[15] = 255; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_adds_epu8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_and_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2; + // + // int64_t d0 = _a[0] & _b[0]; + // int64_t d1 = _a[1] & _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_and_pd(a, b); + // + // return validate_double(c, *((double *)&d0), *((double *)&d1)); + return TEST_UNIMPL; +} + +result_t test_mm_and_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128 fc = _mm_and_ps(*(const __m128 *)&a, *(const __m128 *)&b); + // __m128i c = *(const __m128i *)&fc; + // now for the assertion... 
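+ // The reference result is computed lane by lane on the uint32_t views of a
+ // and b; both the _mm_and_ps output (reinterpreted back to __m128i) and the
+ // reconstructed do_mm_set_epi32 value are compared against it bit-exactly.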
+ // const uint32_t *ia = (const uint32_t *)&a; + // const uint32_t *ib = (const uint32_t *)&b; + // uint32_t r[4]; + // r[0] = ia[0] & ib[0]; + // r[1] = ia[1] & ib[1]; + // r[2] = ia[2] & ib[2]; + // r[3] = ia[3] & ib[3]; + // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + // result_t res = VALIDATE_INT32_M128(c, r); + // if (res) { + // res = VALIDATE_INT32_M128(ret, r); + // } + // return res; + return TEST_UNIMPL; +} + +result_t test_mm_andnot_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_andnot_pd(a, b); + // + // Take AND operation a complement of 'a' and 'b'. Bitwise operations are + // not allowed on float/double datatype, so 'a' and 'b' are calculated in + // uint64_t datatype. + // const uint64_t *ia = (const uint64_t *)&a; + // const uint64_t *ib = (const uint64_t *)&b; + // uint64_t r0 = ~ia[0] & ib[0]; + // uint64_t r1 = ~ia[1] & ib[1]; + // return validate_uint64(*(const __m128i *)&c, r0, r1); + return TEST_UNIMPL; +} + +result_t test_mm_andnot_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128 fc = _mm_andnot_ps(*(const __m128 *)&a, *(const __m128 *)&b); + // __m128i c = *(const __m128i *)&fc; + // now for the assertion... + // const uint32_t *ia = (const uint32_t *)&a; + // const uint32_t *ib = (const uint32_t *)&b; + // uint32_t r[4]; + // r[0] = ~ia[0] & ib[0]; + // r[1] = ~ia[1] & ib[1]; + // r[2] = ~ia[2] & ib[2]; + // r[3] = ~ia[3] & ib[3]; + // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); + // result_t res = TEST_SUCCESS; + // res = VALIDATE_INT32_M128(c, r); + // if (res) { + // res = VALIDATE_INT32_M128(ret, r); + // } + // return res; + return TEST_UNIMPL; +} + +result_t test_mm_avg_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // d[0] = ((uint16_t)_a[0] + (uint16_t)_b[0] + 1) >> 1; + // d[1] = ((uint16_t)_a[1] + (uint16_t)_b[1] + 1) >> 1; + // d[2] = ((uint16_t)_a[2] + (uint16_t)_b[2] + 1) >> 1; + // d[3] = ((uint16_t)_a[3] + (uint16_t)_b[3] + 1) >> 1; + // d[4] = ((uint16_t)_a[4] + (uint16_t)_b[4] + 1) >> 1; + // d[5] = ((uint16_t)_a[5] + (uint16_t)_b[5] + 1) >> 1; + // d[6] = ((uint16_t)_a[6] + (uint16_t)_b[6] + 1) >> 1; + // d[7] = ((uint16_t)_a[7] + (uint16_t)_b[7] + 1) >> 1; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_avg_epu16(a, b); + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_avg_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = ((uint8_t)_a[0] + (uint8_t)_b[0] + 1) >> 1; + // d[1] = ((uint8_t)_a[1] + (uint8_t)_b[1] + 1) >> 1; + // d[2] = ((uint8_t)_a[2] + (uint8_t)_b[2] + 1) >> 1; + // d[3] = ((uint8_t)_a[3] + (uint8_t)_b[3] + 1) >> 1; + // d[4] = ((uint8_t)_a[4] + (uint8_t)_b[4] + 1) >> 1; + // d[5] = ((uint8_t)_a[5] + (uint8_t)_b[5] + 1) >> 1; + // d[6] = ((uint8_t)_a[6] + 
(uint8_t)_b[6] + 1) >> 1; + // d[7] = ((uint8_t)_a[7] + (uint8_t)_b[7] + 1) >> 1; + // d[8] = ((uint8_t)_a[8] + (uint8_t)_b[8] + 1) >> 1; + // d[9] = ((uint8_t)_a[9] + (uint8_t)_b[9] + 1) >> 1; + // d[10] = ((uint8_t)_a[10] + (uint8_t)_b[10] + 1) >> 1; + // d[11] = ((uint8_t)_a[11] + (uint8_t)_b[11] + 1) >> 1; + // d[12] = ((uint8_t)_a[12] + (uint8_t)_b[12] + 1) >> 1; + // d[13] = ((uint8_t)_a[13] + (uint8_t)_b[13] + 1) >> 1; + // d[14] = ((uint8_t)_a[14] + (uint8_t)_b[14] + 1) >> 1; + // d[15] = ((uint8_t)_a[15] + (uint8_t)_b[15] + 1) >> 1; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_avg_epu8(a, b); + // return VALIDATE_UINT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_bslli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_slli_si128(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_bsrli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_srli_si128(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_castpd_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const __m128d a = load_m128d(_a); + // const __m128 _c = load_m128(_a); + // + // __m128 r = _mm_castpd_ps(a); + // + // return validate_128bits(r, _c); + return TEST_UNIMPL; +} + +result_t test_mm_castpd_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const __m128d a = load_m128d(_a); + // const __m128i *_c = (const __m128i *)_a; + // + // __m128i r = _mm_castpd_si128(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_castps_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const __m128 a = load_m128(_a); + // const __m128d *_c = (const __m128d *)_a; + // + // __m128d r = _mm_castps_pd(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_castps_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // + // const __m128i *_c = (const __m128i *)_a; + // + // const __m128 a = load_m128(_a); + // __m128i r = _mm_castps_si128(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_castsi128_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // + // const __m128d *_c = (const __m128d *)_a; + // + // const __m128i a = load_m128i(_a); + // __m128d r = _mm_castsi128_pd(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_castsi128_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // + // const __m128 *_c = (const __m128 *)_a; + // + // const __m128i a = load_m128i(_a); + // __m128 r = _mm_castsi128_ps(a); + // + // return validate_128bits(r, *_c); + return TEST_UNIMPL; +} + +result_t test_mm_clflush(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* FIXME: Assume that we have portable mechanisms to flush cache. */ + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = (_a[0] == _b[0]) ? ~UINT16_C(0) : 0x0; + // d[1] = (_a[1] == _b[1]) ? 
~UINT16_C(0) : 0x0; + // d[2] = (_a[2] == _b[2]) ? ~UINT16_C(0) : 0x0; + // d[3] = (_a[3] == _b[3]) ? ~UINT16_C(0) : 0x0; + // d[4] = (_a[4] == _b[4]) ? ~UINT16_C(0) : 0x0; + // d[5] = (_a[5] == _b[5]) ? ~UINT16_C(0) : 0x0; + // d[6] = (_a[6] == _b[6]) ? ~UINT16_C(0) : 0x0; + // d[7] = (_a[7] == _b[7]) ? ~UINT16_C(0) : 0x0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpeq_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = (_a[0] == _b[0]) ? ~UINT32_C(0) : 0x0; + // d[1] = (_a[1] == _b[1]) ? ~UINT32_C(0) : 0x0; + // d[2] = (_a[2] == _b[2]) ? ~UINT32_C(0) : 0x0; + // d[3] = (_a[3] == _b[3]) ? ~UINT32_C(0) : 0x0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpeq_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = (_a[0] == _b[0]) ? ~UINT8_C(0) : 0x00; + // d[1] = (_a[1] == _b[1]) ? ~UINT8_C(0) : 0x00; + // d[2] = (_a[2] == _b[2]) ? ~UINT8_C(0) : 0x00; + // d[3] = (_a[3] == _b[3]) ? ~UINT8_C(0) : 0x00; + // d[4] = (_a[4] == _b[4]) ? ~UINT8_C(0) : 0x00; + // d[5] = (_a[5] == _b[5]) ? ~UINT8_C(0) : 0x00; + // d[6] = (_a[6] == _b[6]) ? ~UINT8_C(0) : 0x00; + // d[7] = (_a[7] == _b[7]) ? ~UINT8_C(0) : 0x00; + // d[8] = (_a[8] == _b[8]) ? ~UINT8_C(0) : 0x00; + // d[9] = (_a[9] == _b[9]) ? ~UINT8_C(0) : 0x00; + // d[10] = (_a[10] == _b[10]) ? ~UINT8_C(0) : 0x00; + // d[11] = (_a[11] == _b[11]) ? ~UINT8_C(0) : 0x00; + // d[12] = (_a[12] == _b[12]) ? ~UINT8_C(0) : 0x00; + // d[13] = (_a[13] == _b[13]) ? ~UINT8_C(0) : 0x00; + // d[14] = (_a[14] == _b[14]) ? ~UINT8_C(0) : 0x00; + // d[15] = (_a[15] == _b[15]) ? ~UINT8_C(0) : 0x00; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpeq_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] == _b[0]) ? 0xffffffffffffffff : 0; + // uint64_t d1 = (_a[1] == _b[1]) ? 0xffffffffffffffff : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpeq_pd(a, b); + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // const uint64_t d0 = (_a[0] == _b[0]) ? 
~UINT64_C(0) : 0; + // const uint64_t d1 = ((const uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpeq_sd(a, b); + // + // return validate_double(c, *(const double *)&d0, *(const double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpge_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = (_a[1] >= _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpge_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpge_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpge_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // d[0] = _a[0] > _b[0] ? ~UINT16_C(0) : 0; + // d[1] = _a[1] > _b[1] ? ~UINT16_C(0) : 0; + // d[2] = _a[2] > _b[2] ? ~UINT16_C(0) : 0; + // d[3] = _a[3] > _b[3] ? ~UINT16_C(0) : 0; + // d[4] = _a[4] > _b[4] ? ~UINT16_C(0) : 0; + // d[5] = _a[5] > _b[5] ? ~UINT16_C(0) : 0; + // d[6] = _a[6] > _b[6] ? ~UINT16_C(0) : 0; + // d[7] = _a[7] > _b[7] ? ~UINT16_C(0) : 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpgt_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // int32_t result[4]; + // + // result[0] = _a[0] > _b[0] ? -1 : 0; + // result[1] = _a[1] > _b[1] ? -1 : 0; + // result[2] = _a[2] > _b[2] ? -1 : 0; + // result[3] = _a[3] > _b[3] ? -1 : 0; + // + // __m128i iret = _mm_cmpgt_epi32(a, b); + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = (_a[0] > _b[0]) ? ~UINT8_C(0) : 0x00; + // d[1] = (_a[1] > _b[1]) ? ~UINT8_C(0) : 0x00; + // d[2] = (_a[2] > _b[2]) ? ~UINT8_C(0) : 0x00; + // d[3] = (_a[3] > _b[3]) ? ~UINT8_C(0) : 0x00; + // d[4] = (_a[4] > _b[4]) ? ~UINT8_C(0) : 0x00; + // d[5] = (_a[5] > _b[5]) ? ~UINT8_C(0) : 0x00; + // d[6] = (_a[6] > _b[6]) ? ~UINT8_C(0) : 0x00; + // d[7] = (_a[7] > _b[7]) ? ~UINT8_C(0) : 0x00; + // d[8] = (_a[8] > _b[8]) ? ~UINT8_C(0) : 0x00; + // d[9] = (_a[9] > _b[9]) ? ~UINT8_C(0) : 0x00; + // d[10] = (_a[10] > _b[10]) ? ~UINT8_C(0) : 0x00; + // d[11] = (_a[11] > _b[11]) ? 
~UINT8_C(0) : 0x00; + // d[12] = (_a[12] > _b[12]) ? ~UINT8_C(0) : 0x00; + // d[13] = (_a[13] > _b[13]) ? ~UINT8_C(0) : 0x00; + // d[14] = (_a[14] > _b[14]) ? ~UINT8_C(0) : 0x00; + // d[15] = (_a[15] > _b[15]) ? ~UINT8_C(0) : 0x00; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpgt_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = (_a[1] > _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpgt_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpgt_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmple_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = (_a[1] <= _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmple_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmple_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmple_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // d[0] = _a[0] < _b[0] ? ~UINT16_C(0) : 0; + // d[1] = _a[1] < _b[1] ? ~UINT16_C(0) : 0; + // d[2] = _a[2] < _b[2] ? ~UINT16_C(0) : 0; + // d[3] = _a[3] < _b[3] ? ~UINT16_C(0) : 0; + // d[4] = _a[4] < _b[4] ? ~UINT16_C(0) : 0; + // d[5] = _a[5] < _b[5] ? ~UINT16_C(0) : 0; + // d[6] = _a[6] < _b[6] ? ~UINT16_C(0) : 0; + // d[7] = _a[7] < _b[7] ? 
~UINT16_C(0) : 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmplt_epi16(a, b); + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // int32_t result[4]; + // result[0] = _a[0] < _b[0] ? -1 : 0; + // result[1] = _a[1] < _b[1] ? -1 : 0; + // result[2] = _a[2] < _b[2] ? -1 : 0; + // result[3] = _a[3] < _b[3] ? -1 : 0; + // + // __m128i iret = _mm_cmplt_epi32(a, b); + // return VALIDATE_INT32_M128(iret, result); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = (_a[0] < _b[0]) ? ~UINT8_C(0) : 0x00; + // d[1] = (_a[1] < _b[1]) ? ~UINT8_C(0) : 0x00; + // d[2] = (_a[2] < _b[2]) ? ~UINT8_C(0) : 0x00; + // d[3] = (_a[3] < _b[3]) ? ~UINT8_C(0) : 0x00; + // d[4] = (_a[4] < _b[4]) ? ~UINT8_C(0) : 0x00; + // d[5] = (_a[5] < _b[5]) ? ~UINT8_C(0) : 0x00; + // d[6] = (_a[6] < _b[6]) ? ~UINT8_C(0) : 0x00; + // d[7] = (_a[7] < _b[7]) ? ~UINT8_C(0) : 0x00; + // d[8] = (_a[8] < _b[8]) ? ~UINT8_C(0) : 0x00; + // d[9] = (_a[9] < _b[9]) ? ~UINT8_C(0) : 0x00; + // d[10] = (_a[10] < _b[10]) ? ~UINT8_C(0) : 0x00; + // d[11] = (_a[11] < _b[11]) ? ~UINT8_C(0) : 0x00; + // d[12] = (_a[12] < _b[12]) ? ~UINT8_C(0) : 0x00; + // d[13] = (_a[13] < _b[13]) ? ~UINT8_C(0) : 0x00; + // d[14] = (_a[14] < _b[14]) ? ~UINT8_C(0) : 0x00; + // d[15] = (_a[15] < _b[15]) ? ~UINT8_C(0) : 0x00; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmplt_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // int64_t f0 = (_a[0] < _b[0]) ? ~UINT64_C(0) : UINT64_C(0); + // int64_t f1 = (_a[1] < _b[1]) ? ~UINT64_C(0) : UINT64_C(0); + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmplt_pd(a, b); + // + // return validate_double(c, *(double *)&f0, *(double *)&f1); + return TEST_UNIMPL; +} + +result_t test_mm_cmplt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = (_a[0] < _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmplt_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpneq_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // int64_t f0 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0); + // int64_t f1 = (_a[1] != _b[1]) ? 
~UINT64_C(0) : UINT64_C(0); + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpneq_pd(a, b); + // + // return validate_double(c, *(double *)&f0, *(double *)&f1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpneq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // + // int64_t f0 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0); + // int64_t f1 = ((int64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpneq_sd(a, b); + // + // return validate_double(c, *(double *)&f0, *(double *)&f1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnge_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = !(_a[1] >= _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnge_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnge_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnge_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpngt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = !(_a[1] > _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpngt_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpngt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpngt_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnle_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = !(_a[1] <= _b[1]) ? 
~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnle_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnle_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnle_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnlt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = !(_a[1] < _b[1]) ? ~UINT64_C(0) : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnlt_pd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpnlt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // uint64_t d0 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0; + // uint64_t d1 = ((uint64_t *)_a)[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_cmpnlt_sd(a, b); + // + // return validate_double(c, *(double *)&d0, *(double *)&d1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpord_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a = _mm_load_pd(_a); + // __m128d b = _mm_load_pd(_b); + // + // double result[2]; + // + // for (uint32_t i = 0; i < 2; i++) { + // result[i] = cmp_noNaN(_a[i], _b[i]); + // } + // + // __m128d ret = _mm_cmpord_pd(a, b); + // + // return validate_double(ret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpord_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a = _mm_load_pd(_a); + // __m128d b = _mm_load_pd(_b); + // + // double c0 = cmp_noNaN(_a[0], _b[0]); + // double c1 = _a[1]; + // + // __m128d ret = _mm_cmpord_sd(a, b); + // return validate_double(ret, c0, c1); + return TEST_UNIMPL; +} + +result_t test_mm_cmpunord_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a = _mm_load_pd(_a); + // __m128d b = _mm_load_pd(_b); + // + // double result[2]; + // result[0] = cmp_hasNaN(_a[0], _b[0]); + // result[1] = cmp_hasNaN(_a[1], _b[1]); + // + // __m128d ret = _mm_cmpunord_pd(a, b); + // return validate_double(ret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpunord_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *_a = (double *)impl.test_cases_float_pointer1; + // double *_b = (double *)impl.test_cases_float_pointer2; + // __m128d a = _mm_load_pd(_a); + // __m128d b = 
_mm_load_pd(_b); + // + // double result[2]; + // result[0] = cmp_hasNaN(_a[0], _b[0]); + // result[1] = _a[1]; + // + // __m128d ret = _mm_cmpunord_sd(a, b); + // return validate_double(ret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_comieq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comieq_sd correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] == _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comieq_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comige_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] >= _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comige_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_comigt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] > _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comigt_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_comile_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comile_sd correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] <= _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comile_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comilt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comilt_sd correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. + // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] < _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comilt_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_comineq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // FIXME: + // The GCC does not implement _mm_comineq_sd correctly. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98612 for more + // information. 
+ // #if defined(__GNUC__) && !defined(__clang__) + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // int32_t _c = (_a[0] != _b[0]) ? 1 : 0; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // int32_t c = _mm_comineq_sd(a, b); + // + // ASSERT_RETURN(c == _c); + // return TEST_SUCCESS; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi32_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // double trun[2] = {(double)_a[0], (double)_a[1]}; + // + // __m128d ret = _mm_cvtepi32_pd(a); + // return validate_double(ret, trun[0], trun[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi32_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // float trun[4]; + // for (uint32_t i = 0; i < 4; i++) { + // trun[i] = (float)_a[i]; + // } + // + // __m128 ret = _mm_cvtepi32_ps(a); + // return validate_float(ret, trun[0], trun[1], trun[2], trun[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpd_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d[0] = (int32_t)(bankersRounding(_a[0])); + // d[1] = (int32_t)(bankersRounding(_a[1])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d[0] = (int32_t)(floor(_a[0])); + // d[1] = (int32_t)(floor(_a[1])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d[0] = (int32_t)(ceil(_a[0])); + // d[1] = (int32_t)(ceil(_a[1])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d[0] = (int32_t)(_a[0]); + // d[1] = (int32_t)(_a[1]); + // break; + // } + // + // __m128d a = load_m128d(_a); + // __m128i ret = _mm_cvtpd_epi32(a); + // + // return validate_int32(ret, d[0], d[1], 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpd_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // int32_t d[2]; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d[0] = (int32_t)(bankersRounding(_a[0])); + // d[1] = (int32_t)(bankersRounding(_a[1])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d[0] = (int32_t)(floor(_a[0])); + // d[1] = (int32_t)(floor(_a[1])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d[0] = (int32_t)(ceil(_a[0])); + // d[1] = (int32_t)(ceil(_a[1])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d[0] = (int32_t)(_a[0]); + // d[1] = (int32_t)(_a[1]); + // break; + // } + // + // __m128d a = load_m128d(_a); + // __m64 ret = _mm_cvtpd_pi32(a); + // + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpd_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // float f0 = (float)_a[0]; + // float f1 = (float)_a[1]; + // const __m128d a = load_m128d(_a); + // + // __m128 r = _mm_cvtpd_ps(a); + // + // return validate_float(r, f0, f1, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvtpi32_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { 
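+ // Unlike the _mm_cvtpd_* tests above, no rounding-mode cases are needed
+ // here: every int32_t value is exactly representable as a double, so the
+ // conversion from the __m64 lanes to double is exact.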
+ // const int32_t *_a = impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // + // double trun[2] = {(double)_a[0], (double)_a[1]}; + // + // __m128d ret = _mm_cvtpi32_pd(a); + // + // return validate_double(ret, trun[0], trun[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // __m128 a = load_m128(_a); + // int32_t d[4]; + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // for (uint32_t i = 0; i < 4; i++) { + // d[i] = (int32_t)(bankersRounding(_a[i])); + // } + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // for (uint32_t i = 0; i < 4; i++) { + // d[i] = (int32_t)(floorf(_a[i])); + // } + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // for (uint32_t i = 0; i < 4; i++) { + // d[i] = (int32_t)(ceilf(_a[i])); + // } + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // for (uint32_t i = 0; i < 4; i++) { + // d[i] = (int32_t)(_a[i]); + // } + // break; + // } + // + // __m128i ret = _mm_cvtps_epi32(a); + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtps_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // double d0 = (double)_a[0]; + // double d1 = (double)_a[1]; + // const __m128 a = load_m128(_a); + // + // __m128d r = _mm_cvtps_pd(a); + // + // return validate_double(r, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_f64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double d = _a[0]; + // + // const __m128d *a = (const __m128d *)_a; + // double r = _mm_cvtsd_f64(*a); + // + // return r == d ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // int32_t d; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d = (int32_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d = (int32_t)(floor(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d = (int32_t)(ceil(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d = (int32_t)(_a[0]); + // break; + // } + // + // __m128d a = load_m128d(_a); + // int32_t ret = _mm_cvtsd_si32(a); + // + // return ret == d ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // int64_t d; + // + // switch (iter & 0x3) { + // case 0: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // d = (int64_t)(bankersRounding(_a[0])); + // break; + // case 1: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // d = (int64_t)(floor(_a[0])); + // break; + // case 2: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // d = (int64_t)(ceil(_a[0])); + // break; + // case 3: + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // d = (int64_t)(_a[0]); + // break; + // } + // + // __m128d a = load_m128d(_a); + // int64_t ret = _mm_cvtsd_si64(a); + // + // return ret == d ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_si64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_cvtsd_si64(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsd_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // float f0 = _b[0]; + // float f1 = _a[1]; + // float f2 = _a[2]; + // float f3 = _a[3]; + // + // __m128 a = load_m128(_a); + // __m128d b = load_m128d(_b); + // __m128 c = _mm_cvtsd_ss(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi128_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // + // int32_t d = _a[0]; + // + // __m128i a = load_m128i(_a); + // int c = _mm_cvtsi128_si32(a); + // + // return d == c ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi128_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d = _a[0]; + // + // __m128i a = load_m128i(_a); + // int64_t c = _mm_cvtsi128_si64(a); + // + // return d == c ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi128_si64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_cvtsi128_si64(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi32_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const int32_t b = (const int32_t)impl.test_cases_ints[iter]; + // + // __m128d a = load_m128d(_a); + // __m128d c = _mm_cvtsi32_sd(a, b); + // + // return validate_double(c, b, _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi32_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // + // int32_t d = _a[0]; + // + // __m128i c = _mm_cvtsi32_si128(*_a); + // + // return validate_int32(c, d, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const int64_t b = (const int64_t)impl.test_cases_ints[iter]; + // + // __m128d a = load_m128d(_a); + // __m128d c = _mm_cvtsi64_sd(a, b); + // + // return validate_double(c, b, _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d = _a[0]; + // + // __m128i c = _mm_cvtsi64_si128(*_a); + // + // return validate_int64(c, d, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64x_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_cvtsi64_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_cvtsi64x_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_cvtsi64_si128(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_cvtss_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // double d0 = double(_b[0]); + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128 b = load_m128(_b); + // __m128d c = _mm_cvtss_sd(a, b); + // return validate_double(c, d0, d1); + 
return TEST_UNIMPL; +} + +result_t test_mm_cvttpd_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = load_m128d(_a); + // int32_t d0 = (int32_t)(_a[0]); + // int32_t d1 = (int32_t)(_a[1]); + // + // __m128i ret = _mm_cvttpd_epi32(a); + // return validate_int32(ret, d0, d1, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_cvttpd_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = load_m128d(_a); + // int32_t d0 = (int32_t)(_a[0]); + // int32_t d1 = (int32_t)(_a[1]); + // + // __m64 ret = _mm_cvttpd_pi32(a); + // return validate_int32(ret, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_cvttps_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // __m128 a = load_m128(_a); + // int32_t trun[4]; + // for (uint32_t i = 0; i < 4; i++) { + // trun[i] = (int32_t)_a[i]; + // } + // + // __m128i ret = _mm_cvttps_epi32(a); + // return VALIDATE_INT32_M128(ret, trun); + return TEST_UNIMPL; +} + +result_t test_mm_cvttsd_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = _mm_load_sd(_a); + // int32_t ret = _mm_cvttsd_si32(a); + // + // return ret == (int32_t)_a[0] ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvttsd_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = _mm_load_sd(_a); + // int64_t ret = _mm_cvttsd_si64(a); + // + // return ret == (int64_t)_a[0] ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_cvttsd_si64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // #if defined(__clang__) + // The intrinsic _mm_cvttsd_si64x() does not exist in Clang + // return TEST_UNIMPL; + // #else + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // __m128d a = _mm_load_sd(_a); + // int64_t ret = _mm_cvttsd_si64x(a); + // + // return ret == (int64_t)_a[0] ? 
TEST_SUCCESS : TEST_FAIL; + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_div_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = 0.0, d1 = 0.0; + // + // if (_b[0] != 0.0) + // d0 = _a[0] / _b[0]; + // if (_b[1] != 0.0) + // d1 = _a[1] / _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_div_pd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_div_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double d0 = _a[0] / _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // + // __m128d c = _mm_div_sd(a, b); + // + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_extract_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint16_t *_a = (uint16_t *)impl.test_cases_int_pointer1; + // const int idx = iter & 0x7; + // __m128i a = load_m128i(_a); + // int c; + // switch (idx) { + // case 0: + // c = _mm_extract_epi16(a, 0); + // break; + // case 1: + // c = _mm_extract_epi16(a, 1); + // break; + // case 2: + // c = _mm_extract_epi16(a, 2); + // break; + // case 3: + // c = _mm_extract_epi16(a, 3); + // break; + // case 4: + // c = _mm_extract_epi16(a, 4); + // break; + // case 5: + // c = _mm_extract_epi16(a, 5); + // break; + // case 6: + // c = _mm_extract_epi16(a, 6); + // break; + // case 7: + // c = _mm_extract_epi16(a, 7); + // break; + // } + // + // ASSERT_RETURN(c == *(_a + idx)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_insert_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t insert = (int16_t)*impl.test_cases_int_pointer2; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[8]; + // for (int i = 0; i < 8; i++) { + // d##IDX[i] = _a[i]; + // } + // d##IDX[IDX] = insert; + // + // __m128i a##IDX = load_m128i(_a); + // __m128i b##IDX = _mm_insert_epi16(a##IDX, insert, IDX); + // CHECK_RESULT(VALIDATE_INT16_M128(b##IDX, d##IDX)) + // + // IMM_8_ITER + // #undef TEST_IMPL + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_lfence(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* FIXME: Assume that memory barriers always function as intended. 
*/ + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_load_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = _mm_load_pd(p); + // return validate_double(a, p[0], p[1]); + return TEST_UNIMPL; +} + +result_t test_mm_load_pd1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = _mm_load_pd1(p); + // return validate_double(a, p[0], p[0]); + return TEST_UNIMPL; +} + +result_t test_mm_load_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = _mm_load_sd(p); + // return validate_double(a, p[0], 0); + return TEST_UNIMPL; +} + +result_t test_mm_load_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *addr = impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_load_si128((const __m128i *)addr); + // + // return VALIDATE_INT32_M128(ret, addr); + return TEST_UNIMPL; +} + +result_t test_mm_load1_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *addr = (const double *)impl.test_cases_float_pointer1; + // + // __m128d ret = _mm_load1_pd(addr); + // + // return validate_double(ret, addr[0], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_loadh_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *addr = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d ret = _mm_loadh_pd(a, addr); + // + // return validate_double(ret, _a[0], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_loadl_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *addr = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_loadl_epi64((const __m128i *)addr); + // + // return validate_int64(ret, addr[0], 0); + return TEST_UNIMPL; +} + +result_t test_mm_loadl_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *addr = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d ret = _mm_loadl_pd(a, addr); + // + // return validate_double(ret, addr[0], _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_loadr_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *addr = (const double *)impl.test_cases_float_pointer1; + // + // __m128d ret = _mm_loadr_pd(addr); + // + // return validate_double(ret, addr[1], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_loadu_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = _mm_loadu_pd(p); + // return validate_double(a, p[0], p[1]); + return TEST_UNIMPL; +} + +result_t test_mm_loadu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i c = _mm_loadu_si128((const __m128i *)_a); + // return VALIDATE_INT32_M128(c, _a); + return TEST_UNIMPL; +} + +result_t test_mm_loadu_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // The GCC version before 11 does not implement intrinsic function + // _mm_loadu_si32. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483 + // for more information. 
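+  // Illustrative sketch only (not used by the guarded test below): on
+  // toolchains lacking _mm_loadu_si32, the same "load 32 bits into lane 0 and
+  // zero the upper lanes" semantics could be obtained with the SSE2 intrinsic
+  // _mm_cvtsi32_si128:
+  //
+  //   int32_t tmp;
+  //   memcpy(&tmp, addr, sizeof(tmp));       // unaligned-safe 32-bit load
+  //   __m128i ret = _mm_cvtsi32_si128(tmp);  // value in lane 0, rest zeroed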
+ // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) + // return TEST_UNIMPL; + // #else + // const int32_t *addr = (const int32_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_loadu_si32((const void *)addr); + // + // return validate_int32(ret, addr[0], 0, 0, 0); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_madd_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int32_t d0 = (int32_t)_a[0] * _b[0]; + // int32_t d1 = (int32_t)_a[1] * _b[1]; + // int32_t d2 = (int32_t)_a[2] * _b[2]; + // int32_t d3 = (int32_t)_a[3] * _b[3]; + // int32_t d4 = (int32_t)_a[4] * _b[4]; + // int32_t d5 = (int32_t)_a[5] * _b[5]; + // int32_t d6 = (int32_t)_a[6] * _b[6]; + // int32_t d7 = (int32_t)_a[7] * _b[7]; + // + // int32_t e[4]; + // e[0] = d0 + d1; + // e[1] = d2 + d3; + // e[2] = d4 + d5; + // e[3] = d6 + d7; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_madd_epi16(a, b); + // return VALIDATE_INT32_M128(c, e); + return TEST_UNIMPL; +} + +result_t test_mm_maskmoveu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_mask = (const uint8_t *)impl.test_cases_int_pointer2; + // char mem_addr[16]; + // + // __m128i a = load_m128i(_a); + // __m128i mask = load_m128i(_mask); + // _mm_maskmoveu_si128(a, mask, mem_addr); + // + // for (int i = 0; i < 16; i++) { + // if (_mask[i] >> 7) { + // ASSERT_RETURN(_a[i] == (uint8_t)mem_addr[i]); + // } + // } + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_max_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] > _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] > _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] > _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] > _b[7] ? _a[7] : _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_max_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = + // ((uint8_t)_a[0] > (uint8_t)_b[0]) ? ((uint8_t)_a[0]) : + // ((uint8_t)_b[0]); + // d[1] = + // ((uint8_t)_a[1] > (uint8_t)_b[1]) ? ((uint8_t)_a[1]) : + // ((uint8_t)_b[1]); + // d[2] = + // ((uint8_t)_a[2] > (uint8_t)_b[2]) ? ((uint8_t)_a[2]) : + // ((uint8_t)_b[2]); + // d[3] = + // ((uint8_t)_a[3] > (uint8_t)_b[3]) ? ((uint8_t)_a[3]) : + // ((uint8_t)_b[3]); + // d[4] = + // ((uint8_t)_a[4] > (uint8_t)_b[4]) ? ((uint8_t)_a[4]) : + // ((uint8_t)_b[4]); + // d[5] = + // ((uint8_t)_a[5] > (uint8_t)_b[5]) ? ((uint8_t)_a[5]) : + // ((uint8_t)_b[5]); + // d[6] = + // ((uint8_t)_a[6] > (uint8_t)_b[6]) ? ((uint8_t)_a[6]) : + // ((uint8_t)_b[6]); + // d[7] = + // ((uint8_t)_a[7] > (uint8_t)_b[7]) ? ((uint8_t)_a[7]) : + // ((uint8_t)_b[7]); + // d[8] = + // ((uint8_t)_a[8] > (uint8_t)_b[8]) ? 
((uint8_t)_a[8]) : + // ((uint8_t)_b[8]); + // d[9] = + // ((uint8_t)_a[9] > (uint8_t)_b[9]) ? ((uint8_t)_a[9]) : + // ((uint8_t)_b[9]); + // d[10] = ((uint8_t)_a[10] > (uint8_t)_b[10]) ? ((uint8_t)_a[10]) + // : ((uint8_t)_b[10]); + // d[11] = ((uint8_t)_a[11] > (uint8_t)_b[11]) ? ((uint8_t)_a[11]) + // : ((uint8_t)_b[11]); + // d[12] = ((uint8_t)_a[12] > (uint8_t)_b[12]) ? ((uint8_t)_a[12]) + // : ((uint8_t)_b[12]); + // d[13] = ((uint8_t)_a[13] > (uint8_t)_b[13]) ? ((uint8_t)_a[13]) + // : ((uint8_t)_b[13]); + // d[14] = ((uint8_t)_a[14] > (uint8_t)_b[14]) ? ((uint8_t)_a[14]) + // : ((uint8_t)_b[14]); + // d[15] = ((uint8_t)_a[15] > (uint8_t)_b[15]) ? ((uint8_t)_a[15]) + // : ((uint8_t)_b[15]); + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_max_epu8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double f0 = _a[0] > _b[0] ? _a[0] : _b[0]; + // double f1 = _a[1] > _b[1] ? _a[1] : _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_max_pd(a, b); + // + // return validate_double(c, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_max_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] > _b[0] ? _a[0] : _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_max_sd(a, b); + // + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_mfence(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // /* FIXME: Assume that memory barriers always function as intended. */ + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_min_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] < _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] < _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] < _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] < _b[7] ? _a[7] : _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = ((uint8_t)_a[0] < (uint8_t)_b[0]) ? (uint8_t)_a[0] : + // (uint8_t)_b[0]; d[1] = ((uint8_t)_a[1] < (uint8_t)_b[1]) ? (uint8_t)_a[1] + // : (uint8_t)_b[1]; d[2] = ((uint8_t)_a[2] < (uint8_t)_b[2]) ? + // (uint8_t)_a[2] : (uint8_t)_b[2]; d[3] = ((uint8_t)_a[3] < (uint8_t)_b[3]) + // ? (uint8_t)_a[3] : (uint8_t)_b[3]; d[4] = ((uint8_t)_a[4] < + // (uint8_t)_b[4]) ? (uint8_t)_a[4] : (uint8_t)_b[4]; d[5] = ((uint8_t)_a[5] + // < (uint8_t)_b[5]) ? 
(uint8_t)_a[5] : (uint8_t)_b[5]; d[6] = + // ((uint8_t)_a[6] < (uint8_t)_b[6]) ? (uint8_t)_a[6] : (uint8_t)_b[6]; d[7] + // = ((uint8_t)_a[7] < (uint8_t)_b[7]) ? (uint8_t)_a[7] : (uint8_t)_b[7]; + // d[8] = ((uint8_t)_a[8] < (uint8_t)_b[8]) ? (uint8_t)_a[8] : + // (uint8_t)_b[8]; d[9] = ((uint8_t)_a[9] < (uint8_t)_b[9]) ? (uint8_t)_a[9] + // : (uint8_t)_b[9]; d[10] = + // ((uint8_t)_a[10] < (uint8_t)_b[10]) ? (uint8_t)_a[10] : + // (uint8_t)_b[10]; + // d[11] = + // ((uint8_t)_a[11] < (uint8_t)_b[11]) ? (uint8_t)_a[11] : + // (uint8_t)_b[11]; + // d[12] = + // ((uint8_t)_a[12] < (uint8_t)_b[12]) ? (uint8_t)_a[12] : + // (uint8_t)_b[12]; + // d[13] = + // ((uint8_t)_a[13] < (uint8_t)_b[13]) ? (uint8_t)_a[13] : + // (uint8_t)_b[13]; + // d[14] = + // ((uint8_t)_a[14] < (uint8_t)_b[14]) ? (uint8_t)_a[14] : + // (uint8_t)_b[14]; + // d[15] = + // ((uint8_t)_a[15] < (uint8_t)_b[15]) ? (uint8_t)_a[15] : + // (uint8_t)_b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epu8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double f0 = _a[0] < _b[0] ? _a[0] : _b[0]; + // double f1 = _a[1] < _b[1] ? _a[1] : _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // + // __m128d c = _mm_min_pd(a, b); + // return validate_double(c, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_min_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] < _b[0] ? 
_a[0] : _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_min_sd(a, b); + // + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_move_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d0 = _a[0]; + // int64_t d1 = 0; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_move_epi64(a); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_move_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // + // double result[2]; + // result[0] = _b[0]; + // result[1] = _a[1]; + // + // __m128d ret = _mm_move_sd(a, b); + // return validate_double(ret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_movemask_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // + // const uint8_t *ip = (const uint8_t *)_a; + // int ret = 0; + // uint32_t mask = 1; + // for (uint32_t i = 0; i < 16; i++) { + // if (ip[i] & 0x80) { + // ret |= mask; + // } + // mask = mask << 1; + // } + // int test = _mm_movemask_epi8(a); + // ASSERT_RETURN(test == ret); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_movemask_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // unsigned int _c = 0; + // _c |= ((*(const uint64_t *)_a) >> 63) & 0x1; + // _c |= (((*(const uint64_t *)(_a + 1)) >> 62) & 0x2); + // + // __m128d a = load_m128d(_a); + // int c = _mm_movemask_pd(a); + // + // ASSERT_RETURN((unsigned int)c == _c); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_movepi64_pi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d0 = _a[0]; + // + // __m128i a = load_m128i(_a); + // __m64 c = _mm_movepi64_pi64(a); + // + // return validate_int64(c, d0); + return TEST_UNIMPL; +} + +result_t test_mm_movpi64_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // int64_t d0 = _a[0]; + // + // __m64 a = load_m64(_a); + // __m128i c = _mm_movpi64_epi64(a); + // + // return validate_int64(c, d0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_mul_epu32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2; + // uint64_t dx = (uint64_t)(_a[0]) * (uint64_t)(_b[0]); + // uint64_t dy = (uint64_t)(_a[2]) * (uint64_t)(_b[2]); + // + // __m128i a = _mm_loadu_si128((const __m128i *)_a); + // __m128i b = _mm_loadu_si128((const __m128i *)_b); + // __m128i r = _mm_mul_epu32(a, b); + // return validate_uint64(r, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_mul_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] * _b[0]; + // double d1 = _a[1] * _b[1]; + // + // __m128d a = _mm_load_pd(_a); + // __m128d b 
= _mm_load_pd(_b); + // __m128d c = _mm_mul_pd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_mul_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double dx = _a[0] * _b[0]; + // double dy = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_mul_sd(a, b); + // return validate_double(c, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_mul_su32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2; + // + // uint64_t u = (uint64_t)(_a[0]) * (uint64_t)(_b[0]); + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 r = _mm_mul_su32(a, b); + // + // return validate_uint64(r, u); + return TEST_UNIMPL; +} + +result_t test_mm_mulhi_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // for (uint32_t i = 0; i < 8; i++) { + // int32_t m = (int32_t)_a[i] * (int32_t)_b[i]; + // d[i] = (int16_t)(m >> 16); + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_mulhi_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_mulhi_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // for (uint32_t i = 0; i < 8; i++) { + // uint32_t m = (uint32_t)_a[i] * (uint32_t)_b[i]; + // d[i] = (uint16_t)(m >> 16); + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_mulhi_epu16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_mullo_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] * _b[0]; + // d[1] = _a[1] * _b[1]; + // d[2] = _a[2] * _b[2]; + // d[3] = _a[3] * _b[3]; + // d[4] = _a[4] * _b[4]; + // d[5] = _a[5] * _b[5]; + // d[6] = _a[6] * _b[6]; + // d[7] = _a[7] * _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_mullo_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_or_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2; + // + // int64_t d0 = _a[0] | _b[0]; + // int64_t d1 = _a[1] | _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_or_pd(a, b); + // + // return validate_double(c, *((double *)&d0), *((double *)&d1)); + return TEST_UNIMPL; +} + +result_t test_mm_or_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128 fc = _mm_or_ps(*(const __m128 *)&a, 
*(const __m128 *)&b);
+  // __m128i c = *(const __m128i *)&fc;
+  // now for the assertion...
+  // const uint32_t *ia = (const uint32_t *)&a;
+  // const uint32_t *ib = (const uint32_t *)&b;
+  // uint32_t r[4];
+  // r[0] = ia[0] | ib[0];
+  // r[1] = ia[1] | ib[1];
+  // r[2] = ia[2] | ib[2];
+  // r[3] = ia[3] | ib[3];
+  // __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
+  // result_t res = VALIDATE_INT32_M128(c, r);
+  // if (res) {
+  //   res = VALIDATE_INT32_M128(ret, r);
+  // }
+  // return res;
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_packs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // int8_t max = INT8_MAX;
+  // int8_t min = INT8_MIN;
+  // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  //
+  // int8_t d[16];
+  // for (int i = 0; i < 8; i++) {
+  //   if (_a[i] > max)
+  //     d[i] = max;
+  //   else if (_a[i] < min)
+  //     d[i] = min;
+  //   else
+  //     d[i] = (int8_t)_a[i];
+  // }
+  // for (int i = 0; i < 8; i++) {
+  //   if (_b[i] > max)
+  //     d[i + 8] = max;
+  //   else if (_b[i] < min)
+  //     d[i + 8] = min;
+  //   else
+  //     d[i + 8] = (int8_t)_b[i];
+  // }
+  //
+  // __m128i a = load_m128i(_a);
+  // __m128i b = load_m128i(_b);
+  // __m128i c = _mm_packs_epi16(a, b);
+  //
+  // return VALIDATE_INT8_M128(c, d);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_packs_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // int16_t max = INT16_MAX;
+  // int16_t min = INT16_MIN;
+  // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
+  // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
+  //
+  // int16_t d[8];
+  // for (int i = 0; i < 4; i++) {
+  //   if (_a[i] > max)
+  //     d[i] = max;
+  //   else if (_a[i] < min)
+  //     d[i] = min;
+  //   else
+  //     d[i] = (int16_t)_a[i];
+  // }
+  // for (int i = 0; i < 4; i++) {
+  //   if (_b[i] > max)
+  //     d[i + 4] = max;
+  //   else if (_b[i] < min)
+  //     d[i + 4] = min;
+  //   else
+  //     d[i + 4] = (int16_t)_b[i];
+  // }
+  //
+  // __m128i a = load_m128i(_a);
+  // __m128i b = load_m128i(_b);
+  // __m128i c = _mm_packs_epi32(a, b);
+  //
+  // return VALIDATE_INT16_M128(c, d);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_packus_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // uint8_t max = UINT8_MAX;
+  // uint8_t min = 0;
+  // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  //
+  // uint8_t d[16];
+  // for (int i = 0; i < 8; i++) {
+  //   if (_a[i] > (int16_t)max)
+  //     d[i] = max;
+  //   else if (_a[i] < (int16_t)min)
+  //     d[i] = min;
+  //   else
+  //     d[i] = (uint8_t)_a[i];
+  // }
+  // for (int i = 0; i < 8; i++) {
+  //   if (_b[i] > (int16_t)max)
+  //     d[i + 8] = max;
+  //   else if (_b[i] < (int16_t)min)
+  //     d[i + 8] = min;
+  //   else
+  //     d[i + 8] = (uint8_t)_b[i];
+  // }
+  //
+  // __m128i a = load_m128i(_a);
+  // __m128i b = load_m128i(_b);
+  // __m128i c = _mm_packus_epi16(a, b);
+  //
+  // return VALIDATE_UINT8_M128(c, d);
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_pause(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // _mm_pause();
+  // return TEST_SUCCESS;
+  return TEST_UNIMPL;
+}
+
+result_t test_mm_sad_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
+  // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
+  // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2;
+  // uint16_t d0 = 0;
+  // uint16_t d1 = 0;
+  // for (int i = 0; i < 8; i++) {
+  //   d0 += abs(_a[i] - _b[i]);
+  // }
+  // for (int i = 8; i < 16; i++) {
+  //   d1 
+= abs(_a[i] - _b[i]); + // } + // + // const __m128i a = load_m128i(_a); + // const __m128i b = load_m128i(_b); + // __m128i c = _mm_sad_epu8(a, b); + // return validate_uint16(c, d0, 0, 0, 0, d1, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // int16_t d[8]; + // d[0] = _a[0]; + // d[1] = _a[1]; + // d[2] = _a[2]; + // d[3] = _a[3]; + // d[4] = _a[4]; + // d[5] = _a[5]; + // d[6] = _a[6]; + // d[7] = _a[7]; + // + // __m128i c = _mm_set_epi16(d[7], d[6], d[5], d[4], d[3], d[2], d[1], + // d[0]); return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t d[4]; + // d[3] = impl.test_cases_ints[iter]; + // d[2] = impl.test_cases_ints[iter + 1]; + // d[1] = impl.test_cases_ints[iter + 2]; + // d[0] = impl.test_cases_ints[iter + 3]; + // __m128i a = _mm_set_epi32(d[3], d[2], d[1], d[0]); + // return VALIDATE_INT32_M128(a, d); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_set_epi64(load_m64(&_a[1]), load_m64(&_a[0])); + // + // return validate_int64(ret, _a[0], _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_set_epi64x(_a[1], _a[0]); + // + // return validate_int64(ret, _a[0], _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_set_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // int8_t d[16]; + // d[0] = _a[0]; + // d[1] = _a[1]; + // d[2] = _a[2]; + // d[3] = _a[3]; + // d[4] = _a[4]; + // d[5] = _a[5]; + // d[6] = _a[6]; + // d[7] = _a[7]; + // d[8] = _a[8]; + // d[9] = _a[9]; + // d[10] = _a[10]; + // d[11] = _a[11]; + // d[12] = _a[12]; + // d[13] = _a[13]; + // d[14] = _a[14]; + // d[15] = _a[15]; + // + // __m128i c = _mm_set_epi8(d[15], d[14], d[13], d[12], d[11], d[10], d[9], + // d[8], + // d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_set_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // double x = p[0]; + // double y = p[1]; + // __m128d a = _mm_set_pd(x, y); + // return validate_double(a, y, x); + return TEST_UNIMPL; +} + +result_t test_mm_set_pd1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double _a = impl.test_cases_floats[iter]; + // + // __m128d a = _mm_set_pd1(_a); + // + // return validate_double(a, _a, _a); + return TEST_UNIMPL; +} + +result_t test_mm_set_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double f0 = _a[0]; + // double f1 = 0.0; + // + // __m128d a = _mm_set_sd(_a[0]); + // return validate_double(a, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // int16_t d0 = _a[0]; + // + // __m128i c = _mm_set1_epi16(d0); + // return validate_int16(c, d0, d0, d0, d0, d0, d0, d0, d0); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi32(const 
SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t x = impl.test_cases_ints[iter]; + // __m128i a = _mm_set1_epi32(x); + // return validate_int32(a, x, x, x, x); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_set1_epi64(load_m64(&_a[0])); + // + // return validate_int64(ret, _a[0], _a[0]); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi64x(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_set1_epi64x(_a[0]); + // + // return validate_int64(ret, _a[0], _a[0]); + return TEST_UNIMPL; +} + +result_t test_mm_set1_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // int8_t d0 = _a[0]; + // __m128i c = _mm_set1_epi8(d0); + // return validate_int8(c, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, d0, + // d0, + // d0, d0, d0); + return TEST_UNIMPL; +} + +result_t test_mm_set1_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // double d0 = _a[0]; + // __m128d c = _mm_set1_pd(d0); + // return validate_double(c, d0, d0); + return TEST_UNIMPL; +} + +result_t test_mm_setr_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // + // __m128i c = + // _mm_setr_epi16(_a[0], _a[1], _a[2], _a[3], _a[4], _a[5], _a[6], + // _a[7]); + // + // return VALIDATE_INT16_M128(c, _a); + return TEST_UNIMPL; +} + +result_t test_mm_setr_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i c = _mm_setr_epi32(_a[0], _a[1], _a[2], _a[3]); + // return VALIDATE_INT32_M128(c, _a); + return TEST_UNIMPL; +} + +result_t test_mm_setr_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // __m128i c = _mm_setr_epi64(load_m64(&_a[0]), load_m64(&_a[1])); + // return validate_int64(c, _a[0], _a[1]); + return TEST_UNIMPL; +} + +result_t test_mm_setr_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // __m128i c = _mm_setr_epi8(_a[0], _a[1], _a[2], _a[3], _a[4], _a[5], + // _a[6], + // _a[7], _a[8], _a[9], _a[10], _a[11], _a[12], + // _a[13], _a[14], _a[15]); + // + // return VALIDATE_INT8_M128(c, _a); + return TEST_UNIMPL; +} + +result_t test_mm_setr_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // + // double x = p[0]; + // double y = p[1]; + // + // __m128d a = _mm_setr_pd(x, y); + // + // return validate_double(a, x, y); + return TEST_UNIMPL; +} + +result_t test_mm_setzero_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128d a = _mm_setzero_pd(); + // return validate_double(a, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_setzero_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128i a = _mm_setzero_si128(); + // return validate_int32(a, 0, 0, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // __m128i a, c; + // + // #define TEST_IMPL(IDX) + // int32_t d##IDX[4]; + // d##IDX[0] = _a[((IDX) & 
0x3)]; + // d##IDX[1] = _a[((IDX >> 2) & 0x3)]; + // d##IDX[2] = _a[((IDX >> 4) & 0x3)]; + // d##IDX[3] = _a[((IDX >> 6) & 0x3)]; + // + // a = load_m128i(_a); + // c = _mm_shuffle_epi32(a, IDX); + // CHECK_RESULT(VALIDATE_INT32_M128(c, d##IDX)) + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a, b, c; + // + // #define TEST_IMPL(IDX) + // a = load_m128d(_a); + // b = load_m128d(_b); + // c = _mm_shuffle_pd(a, b, IDX); + // + // double d0##IDX = _a[IDX & 0x1]; + // double d1##IDX = _b[(IDX & 0x2) >> 1]; + // CHECK_RESULT(validate_double(c, d0##IDX, d1##IDX)) + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_shufflehi_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a, c; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[8]; + // d##IDX[0] = _a[0]; + // d##IDX[1] = _a[1]; + // d##IDX[2] = _a[2]; + // d##IDX[3] = _a[3]; + // d##IDX[4] = ((const int64_t *)_a)[1] >> ((IDX & 0x3) * 16); + // d##IDX[5] = ((const int64_t *)_a)[1] >> (((IDX >> 2) & 0x3) * 16); + // d##IDX[6] = ((const int64_t *)_a)[1] >> (((IDX >> 4) & 0x3) * 16); + // d##IDX[7] = ((const int64_t *)_a)[1] >> (((IDX >> 6) & 0x3) * 16); + // + // a = load_m128i(_a); + // c = _mm_shufflehi_epi16(a, IDX); + // + // CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_shufflelo_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a, c; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[8]; + // d##IDX[0] = ((const int64_t *)_a)[0] >> ((IDX & 0x3) * 16); + // d##IDX[1] = ((const int64_t *)_a)[0] >> (((IDX >> 2) & 0x3) * 16); + // d##IDX[2] = ((const int64_t *)_a)[0] >> (((IDX >> 4) & 0x3) * 16); + // d##IDX[3] = ((const int64_t *)_a)[0] >> (((IDX >> 6) & 0x3) * 16); + // d##IDX[4] = _a[4]; + // d##IDX[5] = _a[5]; + // d##IDX[6] = _a[6]; + // d##IDX[7] = _a[7]; + // + // a = load_m128i(_a); + // c = _mm_shufflelo_epi16(a, IDX); + // + // CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sll_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a, b, c; + // + // #define TEST_IMPL(IDX) + // uint16_t d##IDX[8]; + // d##IDX[0] = (IDX > 15) ? 0 : _a[0] << IDX; + // d##IDX[1] = (IDX > 15) ? 0 : _a[1] << IDX; + // d##IDX[2] = (IDX > 15) ? 0 : _a[2] << IDX; + // d##IDX[3] = (IDX > 15) ? 0 : _a[3] << IDX; + // d##IDX[4] = (IDX > 15) ? 0 : _a[4] << IDX; + // d##IDX[5] = (IDX > 15) ? 0 : _a[5] << IDX; + // d##IDX[6] = (IDX > 15) ? 0 : _a[6] << IDX; + // d##IDX[7] = (IDX > 15) ? 
0 : _a[7] << IDX; + // + // a = load_m128i(_a); + // b = _mm_set1_epi64x(IDX); + // c = _mm_sll_epi16(a, b); + // CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + // + // IMM_64_ITER + // #undef TEST_IMPL + // + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sll_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i a, b, c; + // + // #define TEST_IMPL(IDX) + // uint32_t d##IDX[4]; + // d##IDX[0] = (IDX > 31) ? 0 : _a[0] << IDX; + // d##IDX[1] = (IDX > 31) ? 0 : _a[1] << IDX; + // d##IDX[2] = (IDX > 31) ? 0 : _a[2] << IDX; + // d##IDX[3] = (IDX > 31) ? 0 : _a[3] << IDX; + // + // a = load_m128i(_a); + // b = _mm_set1_epi64x(IDX); + // c = _mm_sll_epi32(a, b); + // CHECK_RESULT(VALIDATE_INT32_M128(c, d##IDX)) + // + // IMM_64_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sll_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // __m128i a, b, c; + // + // #define TEST_IMPL(IDX) + // uint64_t d0##IDX = (IDX & ~63) ? 0 : _a[0] << IDX; + // uint64_t d1##IDX = (IDX & ~63) ? 0 : _a[1] << IDX; + // + // a = load_m128i(_a); + // b = _mm_set1_epi64x(IDX); + // c = _mm_sll_epi64(a, b); + // + // CHECK_RESULT(validate_int64(c, d0##IDX, d1##IDX)) + // + // IMM_64_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_slli_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a, c; + // + // #define TEST_IMPL(IDX) + // int16_t d##IDX[8]; + // d##IDX[0] = (IDX > 15) ? 0 : _a[0] << IDX; + // d##IDX[1] = (IDX > 15) ? 0 : _a[1] << IDX; + // d##IDX[2] = (IDX > 15) ? 0 : _a[2] << IDX; + // d##IDX[3] = (IDX > 15) ? 0 : _a[3] << IDX; + // d##IDX[4] = (IDX > 15) ? 0 : _a[4] << IDX; + // d##IDX[5] = (IDX > 15) ? 0 : _a[5] << IDX; + // d##IDX[6] = (IDX > 15) ? 0 : _a[6] << IDX; + // d##IDX[7] = (IDX > 15) ? 0 : _a[7] << IDX; + // + // a = load_m128i(_a); + // c = _mm_slli_epi16(a, IDX); + // CHECK_RESULT(VALIDATE_INT16_M128(c, d##IDX)) + // + // IMM_64_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_slli_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // #if defined(__clang__) + // Clang compiler does not allow the second argument of _mm_slli_epi32() to + // be greater than 31. + // const int count = (int)(iter % 33 - 1); // range: -1 ~ 31 + // #else + // const int count = (int)(iter % 34 - 1); // range: -1 ~ 32 + // #endif + // + // int32_t d[4]; + // d[0] = (count & ~31) ? 0 : _a[0] << count; + // d[1] = (count & ~31) ? 0 : _a[1] << count; + // d[2] = (count & ~31) ? 0 : _a[2] << count; + // d[3] = (count & ~31) ? 0 : _a[3] << count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_slli_epi32(a, count); + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_slli_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // #if defined(__clang__) + // Clang compiler does not allow the second argument of "_mm_slli_epi64()" + // to be greater than 63. 
+ // const int count = (int)(iter % 65 - 1); // range: -1 ~ 63 + // #else + // const int count = (int)(iter % 66 - 1); // range: -1 ~ 64 + // #endif + // int64_t d0 = (count & ~63) ? 0 : _a[0] << count; + // int64_t d1 = (count & ~63) ? 0 : _a[1] << count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_slli_epi64(a, count); + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_slli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // + // int8_t d[16]; + // int count = (iter % 5) << 2; + // for (int i = 0; i < 16; i++) { + // if (i < count) + // d[i] = 0; + // else + // d[i] = ((const int8_t *)_a)[i - count]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i ret; + // switch (iter % 5) { + // case 0: + // ret = _mm_slli_si128(a, 0); + // break; + // case 1: + // ret = _mm_slli_si128(a, 4); + // break; + // case 2: + // ret = _mm_slli_si128(a, 8); + // break; + // case 3: + // ret = _mm_slli_si128(a, 12); + // break; + // case 4: + // ret = _mm_slli_si128(a, 16); + // break; + // } + // + // return VALIDATE_INT8_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_sqrt_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double f0 = sqrt(_a[0]); + // double f1 = sqrt(_a[1]); + // + // __m128d a = load_m128d(_a); + // __m128d c = _mm_sqrt_pd(a); + // + // return validate_double_error(c, f0, f1, 1.0e-15); + return TEST_UNIMPL; +} + +result_t test_mm_sqrt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double f0 = sqrt(_b[0]); + // double f1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_sqrt_sd(a, b); + // + // return validate_double_error(c, f0, f1, 1.0e-15); + return TEST_UNIMPL; +} + +result_t test_mm_sra_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 18 - 1); // range: -1 ~ 16 + // + // int16_t d[8]; + // d[0] = (count & ~15) ? (_a[0] < 0 ? ~UINT16_C(0) : 0) : (_a[0] >> count); + // d[1] = (count & ~15) ? (_a[1] < 0 ? ~UINT16_C(0) : 0) : (_a[1] >> count); + // d[2] = (count & ~15) ? (_a[2] < 0 ? ~UINT16_C(0) : 0) : (_a[2] >> count); + // d[3] = (count & ~15) ? (_a[3] < 0 ? ~UINT16_C(0) : 0) : (_a[3] >> count); + // d[4] = (count & ~15) ? (_a[4] < 0 ? ~UINT16_C(0) : 0) : (_a[4] >> count); + // d[5] = (count & ~15) ? (_a[5] < 0 ? ~UINT16_C(0) : 0) : (_a[5] >> count); + // d[6] = (count & ~15) ? (_a[6] < 0 ? ~UINT16_C(0) : 0) : (_a[6] >> count); + // d[7] = (count & ~15) ? (_a[7] < 0 ? ~UINT16_C(0) : 0) : (_a[7] >> count); + // + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_sra_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sra_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 34 - 1); // range: -1 ~ 32 + // + // int32_t d[4]; + // d[0] = (count & ~31) ? (_a[0] < 0 ? ~UINT32_C(0) : 0) : _a[0] >> count; + // d[1] = (count & ~31) ? (_a[1] < 0 ? ~UINT32_C(0) : 0) : _a[1] >> count; + // d[2] = (count & ~31) ? (_a[2] < 0 ? 
~UINT32_C(0) : 0) : _a[2] >> count; + // d[3] = (count & ~31) ? (_a[3] < 0 ? ~UINT32_C(0) : 0) : _a[3] >> count; + // + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_sra_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srai_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int32_t b = (int32_t)(iter % 18 - 1); // range: -1 ~ 16 + // int16_t d[8]; + // int count = (b & ~15) ? 15 : b; + // + // for (int i = 0; i < 8; i++) { + // d[i] = _a[i] >> count; + // } + // + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i c = _mm_srai_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srai_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t b = (int32_t)(iter % 34 - 1); // range: -1 ~ 32 + // + // int32_t d[4]; + // int count = (b & ~31) ? 31 : b; + // for (int i = 0; i < 4; i++) { + // d[i] = _a[i] >> count; + // } + // + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i c = _mm_srai_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srl_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 18 - 1); // range: -1 ~ 16 + // + // uint16_t d[8]; + // d[0] = (count & ~15) ? 0 : (uint16_t)(_a[0]) >> count; + // d[1] = (count & ~15) ? 0 : (uint16_t)(_a[1]) >> count; + // d[2] = (count & ~15) ? 0 : (uint16_t)(_a[2]) >> count; + // d[3] = (count & ~15) ? 0 : (uint16_t)(_a[3]) >> count; + // d[4] = (count & ~15) ? 0 : (uint16_t)(_a[4]) >> count; + // d[5] = (count & ~15) ? 0 : (uint16_t)(_a[5]) >> count; + // d[6] = (count & ~15) ? 0 : (uint16_t)(_a[6]) >> count; + // d[7] = (count & ~15) ? 0 : (uint16_t)(_a[7]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_srl_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srl_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 34 - 1); // range: -1 ~ 32 + // + // uint32_t d[4]; + // d[0] = (count & ~31) ? 0 : (uint32_t)(_a[0]) >> count; + // d[1] = (count & ~31) ? 0 : (uint32_t)(_a[1]) >> count; + // d[2] = (count & ~31) ? 0 : (uint32_t)(_a[2]) >> count; + // d[3] = (count & ~31) ? 0 : (uint32_t)(_a[3]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_srl_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srl_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t count = (int64_t)(iter % 66 - 1); // range: -1 ~ 64 + // + // uint64_t d0 = (count & ~63) ? 0 : (uint64_t)(_a[0]) >> count; + // uint64_t d1 = (count & ~63) ? 
0 : (uint64_t)(_a[1]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i b = _mm_set1_epi64x(count); + // __m128i c = _mm_srl_epi64(a, b); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_srli_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int count = (int)(iter % 18 - 1); // range: -1 ~ 16 + // + // int16_t d[8]; + // d[0] = count & (~15) ? 0 : (uint16_t)(_a[0]) >> count; + // d[1] = count & (~15) ? 0 : (uint16_t)(_a[1]) >> count; + // d[2] = count & (~15) ? 0 : (uint16_t)(_a[2]) >> count; + // d[3] = count & (~15) ? 0 : (uint16_t)(_a[3]) >> count; + // d[4] = count & (~15) ? 0 : (uint16_t)(_a[4]) >> count; + // d[5] = count & (~15) ? 0 : (uint16_t)(_a[5]) >> count; + // d[6] = count & (~15) ? 0 : (uint16_t)(_a[6]) >> count; + // d[7] = count & (~15) ? 0 : (uint16_t)(_a[7]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_srli_epi16(a, count); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srli_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int count = (int)(iter % 34 - 1); // range: -1 ~ 32 + // + // int32_t d[4]; + // d[0] = count & (~31) ? 0 : (uint32_t)(_a[0]) >> count; + // d[1] = count & (~31) ? 0 : (uint32_t)(_a[1]) >> count; + // d[2] = count & (~31) ? 0 : (uint32_t)(_a[2]) >> count; + // d[3] = count & (~31) ? 0 : (uint32_t)(_a[3]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_srli_epi32(a, count); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_srli_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int count = (int)(iter % 66 - 1); // range: -1 ~ 64 + // + // int64_t d0 = count & (~63) ? 0 : (uint64_t)(_a[0]) >> count; + // int64_t d1 = count & (~63) ? 
0 : (uint64_t)(_a[1]) >> count; + // + // __m128i a = load_m128i(_a); + // __m128i c = _mm_srli_epi64(a, count); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_srli_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int count = (iter % 5) << 2; + // + // int8_t d[16]; + // for (int i = 0; i < 16; i++) { + // if (i >= (16 - count)) + // d[i] = 0; + // else + // d[i] = _a[i + count]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i ret; + // switch (iter % 5) { + // case 0: + // ret = _mm_srli_si128(a, 0); + // break; + // case 1: + // ret = _mm_srli_si128(a, 4); + // break; + // case 2: + // ret = _mm_srli_si128(a, 8); + // break; + // case 3: + // ret = _mm_srli_si128(a, 12); + // break; + // case 4: + // ret = _mm_srli_si128(a, 16); + // break; + // } + // + // return VALIDATE_INT8_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_store_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double x = impl.test_cases_floats[iter + 4]; + // double y = impl.test_cases_floats[iter + 6]; + // + // __m128d a = _mm_set_pd(x, y); + // _mm_store_pd(p, a); + // ASSERT_RETURN(p[0] == y); + // ASSERT_RETURN(p[1] == x); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_pd1(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double _a[2] = {(double)impl.test_cases_floats[iter], + // (double)impl.test_cases_floats[iter + 1]}; + // + // __m128d a = load_m128d(_a); + // _mm_store_pd1(p, a); + // ASSERT_RETURN(p[0] == impl.test_cases_floats[iter]); + // ASSERT_RETURN(p[1] == impl.test_cases_floats[iter]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double _a[2] = {(double)impl.test_cases_floats[iter], + // (double)impl.test_cases_floats[iter + 1]}; + // + // __m128d a = load_m128d(_a); + // _mm_store_sd(p, a); + // ASSERT_RETURN(p[0] == impl.test_cases_floats[iter]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_store_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // alignas(16) int32_t p[4]; + // + // __m128i a = load_m128i(_a); + // _mm_store_si128((__m128i *)p, a); + // + // return VALIDATE_INT32_M128(a, p); + return TEST_UNIMPL; +} + +result_t test_mm_store1_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_store_pd1(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_storeh_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double mem; + // + // __m128d a = load_m128d(p); + // _mm_storeh_pd(&mem, a); + // + // ASSERT_RETURN(mem == p[1]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storel_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int64_t *p = (int64_t *)impl.test_cases_int_pointer1; + // __m128i mem; + // + // __m128i a = load_m128i(p); + // _mm_storel_epi64(&mem, a); + // + // ASSERT_RETURN(((SIMDVec *)&mem)->m128_u64[0] == (uint64_t)p[0]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storel_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // 
double mem; + // + // __m128d a = load_m128d(p); + // _mm_storel_pd(&mem, a); + // + // ASSERT_RETURN(mem == p[0]); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storer_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double mem[2]; + // + // __m128d a = load_m128d(p); + // _mm_storer_pd(mem, a); + // + // __m128d res = load_m128d(mem); + // return validate_double(res, p[1], p[0]); + return TEST_UNIMPL; +} + +result_t test_mm_storeu_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // double *p = (double *)impl.test_cases_float_pointer1; + // double x = impl.test_cases_floats[iter + 4]; + // double y = impl.test_cases_floats[iter + 6]; + // + // __m128d a = _mm_set_pd(x, y); + // _mm_storeu_pd(p, a); + // ASSERT_RETURN(p[0] == y); + // ASSERT_RETURN(p[1] == x); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_storeu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i b; + // __m128i a = load_m128i(_a); + // _mm_storeu_si128(&b, a); + // int32_t *_b = (int32_t *)&b; + // return VALIDATE_INT32_M128(a, _b); + return TEST_UNIMPL; +} + +result_t test_mm_storeu_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // The GCC version before 11 does not implement intrinsic function + // _mm_storeu_si32. Check https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95483 + // for more information. + // #if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) + // return TEST_UNIMPL; + // #else + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i b; + // __m128i a = load_m128i(_a); + // _mm_storeu_si32(&b, a); + // int32_t *_b = (int32_t *)&b; + // return validate_int32(b, _a[0], _b[1], _b[2], _b[3]); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_stream_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // double p[2]; + // + // __m128d a = load_m128d(_a); + // _mm_stream_pd(p, a); + // + // return validate_double(a, p[0], p[1]); + return TEST_UNIMPL; +} + +result_t test_mm_stream_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // alignas(16) int32_t p[4]; + // + // __m128i a = load_m128i(_a); + // _mm_stream_si128((__m128i *)p, a); + // + // return VALIDATE_INT32_M128(a, p); + return TEST_UNIMPL; +} + +result_t test_mm_stream_si32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t a = (const int32_t)impl.test_cases_ints[iter]; + // int32_t p; + // + // _mm_stream_si32(&p, a); + // + // ASSERT_RETURN(a == p) + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_stream_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t a = (const int64_t)impl.test_cases_ints[iter]; + // __int64 p[1]; + // _mm_stream_si64(p, a); + // ASSERT_RETURN(p[0] == a); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_sub_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] - _b[0]; + // d[1] = _a[1] - _b[1]; + // d[2] = _a[2] - _b[2]; + // d[3] = _a[3] - _b[3]; + // d[4] = _a[4] - _b[4]; + // d[5] = _a[5] - _b[5]; + // d[6] = _a[6] - _b[6]; + // d[7] = _a[7] - _b[7]; + // + // 
__m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sub_epi16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sub_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // int32_t d[4]; + // d[0] = _a[0] - _b[0]; + // d[1] = _a[1] - _b[1]; + // d[2] = _a[2] - _b[2]; + // d[3] = _a[3] - _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sub_epi32(a, b); + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sub_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (int64_t *)impl.test_cases_int_pointer2; + // int64_t d0 = _a[0] - _b[0]; + // int64_t d1 = _a[1] - _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sub_epi64(a, b); + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_sub_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = _a[0] - _b[0]; + // d[1] = _a[1] - _b[1]; + // d[2] = _a[2] - _b[2]; + // d[3] = _a[3] - _b[3]; + // d[4] = _a[4] - _b[4]; + // d[5] = _a[5] - _b[5]; + // d[6] = _a[6] - _b[6]; + // d[7] = _a[7] - _b[7]; + // d[8] = _a[8] - _b[8]; + // d[9] = _a[9] - _b[9]; + // d[10] = _a[10] - _b[10]; + // d[11] = _a[11] - _b[11]; + // d[12] = _a[12] - _b[12]; + // d[13] = _a[13] - _b[13]; + // d[14] = _a[14] - _b[14]; + // d[15] = _a[15] - _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sub_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sub_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] - _b[0]; + // double d1 = _a[1] - _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_sub_pd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_sub_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // double d0 = _a[0] - _b[0]; + // double d1 = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_sub_sd(a, b); + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_sub_si64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t d = _a[0] - _b[0]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sub_si64(a, b); + // + // return validate_int64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_subs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t max = 32767; + // int32_t min = -32768; + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + 
// + // int16_t d[8]; + // for (int i = 0; i < 8; i++) { + // int32_t res = (int32_t)_a[i] - (int32_t)_b[i]; + // if (res > max) + // d[i] = max; + // else if (res < min) + // d[i] = min; + // else + // d[i] = (int16_t)res; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_subs_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_subs_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int16_t max = 127; + // int16_t min = -128; + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; + // for (int i = 0; i < 16; i++) { + // int16_t res = (int16_t)_a[i] - (int16_t)_b[i]; + // if (res > max) + // d[i] = max; + // else if (res < min) + // d[i] = min; + // else + // d[i] = (int8_t)res; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_subs_epi8(a, b); + // + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_subs_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // uint16_t d[8]; + // d[0] = (uint16_t)_a[0] - (uint16_t)_b[0]; + // if (d[0] > (uint16_t)_a[0]) + // d[0] = 0; + // d[1] = (uint16_t)_a[1] - (uint16_t)_b[1]; + // if (d[1] > (uint16_t)_a[1]) + // d[1] = 0; + // d[2] = (uint16_t)_a[2] - (uint16_t)_b[2]; + // if (d[2] > (uint16_t)_a[2]) + // d[2] = 0; + // d[3] = (uint16_t)_a[3] - (uint16_t)_b[3]; + // if (d[3] > (uint16_t)_a[3]) + // d[3] = 0; + // d[4] = (uint16_t)_a[4] - (uint16_t)_b[4]; + // if (d[4] > (uint16_t)_a[4]) + // d[4] = 0; + // d[5] = (uint16_t)_a[5] - (uint16_t)_b[5]; + // if (d[5] > (uint16_t)_a[5]) + // d[5] = 0; + // d[6] = (uint16_t)_a[6] - (uint16_t)_b[6]; + // if (d[6] > (uint16_t)_a[6]) + // d[6] = 0; + // d[7] = (uint16_t)_a[7] - (uint16_t)_b[7]; + // if (d[7] > (uint16_t)_a[7]) + // d[7] = 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_subs_epu16(a, b); + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_subs_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // uint8_t d[16]; + // d[0] = (uint8_t)_a[0] - (uint8_t)_b[0]; + // if (d[0] > (uint8_t)_a[0]) + // d[0] = 0; + // d[1] = (uint8_t)_a[1] - (uint8_t)_b[1]; + // if (d[1] > (uint8_t)_a[1]) + // d[1] = 0; + // d[2] = (uint8_t)_a[2] - (uint8_t)_b[2]; + // if (d[2] > (uint8_t)_a[2]) + // d[2] = 0; + // d[3] = (uint8_t)_a[3] - (uint8_t)_b[3]; + // if (d[3] > (uint8_t)_a[3]) + // d[3] = 0; + // d[4] = (uint8_t)_a[4] - (uint8_t)_b[4]; + // if (d[4] > (uint8_t)_a[4]) + // d[4] = 0; + // d[5] = (uint8_t)_a[5] - (uint8_t)_b[5]; + // if (d[5] > (uint8_t)_a[5]) + // d[5] = 0; + // d[6] = (uint8_t)_a[6] - (uint8_t)_b[6]; + // if (d[6] > (uint8_t)_a[6]) + // d[6] = 0; + // d[7] = (uint8_t)_a[7] - (uint8_t)_b[7]; + // if (d[7] > (uint8_t)_a[7]) + // d[7] = 0; + // d[8] = (uint8_t)_a[8] - (uint8_t)_b[8]; + // if (d[8] > (uint8_t)_a[8]) + // d[8] = 0; + // d[9] = (uint8_t)_a[9] - (uint8_t)_b[9]; + // if (d[9] > (uint8_t)_a[9]) + // d[9] = 0; + // d[10] = (uint8_t)_a[10] - (uint8_t)_b[10]; + // if (d[10] > (uint8_t)_a[10]) + // d[10] = 0; + // d[11] = (uint8_t)_a[11] 
- (uint8_t)_b[11]; + // if (d[11] > (uint8_t)_a[11]) + // d[11] = 0; + // d[12] = (uint8_t)_a[12] - (uint8_t)_b[12]; + // if (d[12] > (uint8_t)_a[12]) + // d[12] = 0; + // d[13] = (uint8_t)_a[13] - (uint8_t)_b[13]; + // if (d[13] > (uint8_t)_a[13]) + // d[13] = 0; + // d[14] = (uint8_t)_a[14] - (uint8_t)_b[14]; + // if (d[14] > (uint8_t)_a[14]) + // d[14] = 0; + // d[15] = (uint8_t)_a[15] - (uint8_t)_b[15]; + // if (d[15] > (uint8_t)_a[15]) + // d[15] = 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_subs_epu8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_ucomieq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comieq_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomige_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comige_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomigt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comigt_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomile_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comile_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomilt_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comilt_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_ucomineq_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_comineq_sd(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_undefined_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128d a = _mm_undefined_pd(); + // a = _mm_xor_pd(a, a); + // return validate_double(a, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_undefined_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // __m128i a = _mm_undefined_si128(); + // a = _mm_xor_si128(a, a); + // return validate_int64(a, 0, 0); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[8]; + // d[0] = _a[4]; + // d[1] = _b[4]; + // d[2] = _a[5]; + // d[3] = _b[5]; + // d[4] = _a[6]; + // d[5] = _b[6]; + // d[6] = _a[7]; + // d[7] = _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpackhi_epi16(a, b); + // + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = _a[2]; + // d[1] = _b[2]; + // d[2] = _a[3]; + // d[3] = _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpackhi_epi32(a, b); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t i0 = _a[1]; + // int64_t i1 = _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpackhi_epi64(a, b); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_epi8(const 
SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; + // d[0] = _a[8]; + // d[1] = _b[8]; + // d[2] = _a[9]; + // d[3] = _b[9]; + // d[4] = _a[10]; + // d[5] = _b[10]; + // d[6] = _a[11]; + // d[7] = _b[11]; + // d[8] = _a[12]; + // d[9] = _b[12]; + // d[10] = _a[13]; + // d[11] = _b[13]; + // d[12] = _a[14]; + // d[13] = _b[14]; + // d[14] = _a[15]; + // d[15] = _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpackhi_epi8(a, b); + // + // return VALIDATE_INT8_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpackhi_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d ret = _mm_unpackhi_pd(a, b); + // + // return validate_double(ret, _a[1], _b[1]); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[8]; + // d[0] = _a[0]; + // d[1] = _b[0]; + // d[2] = _a[1]; + // d[3] = _b[1]; + // d[4] = _a[2]; + // d[5] = _b[2]; + // d[6] = _a[3]; + // d[7] = _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpacklo_epi16(a, b); + // + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = _a[0]; + // d[1] = _b[0]; + // d[2] = _a[1]; + // d[3] = _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpacklo_epi32(a, b); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t i0 = _a[0]; + // int64_t i1 = _b[0]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpacklo_epi64(a, b); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; + // d[0] = _a[0]; + // d[1] = _b[0]; + // d[2] = _a[1]; + // d[3] = _b[1]; + // d[4] = _a[2]; + // d[5] = _b[2]; + // d[6] = _a[3]; + // d[7] = _b[3]; + // d[8] = _a[4]; + // d[9] = _b[4]; + // d[10] = _a[5]; + // d[11] = _b[5]; + // d[12] = _a[6]; + // d[13] = _b[6]; + // d[14] = _a[7]; + // d[15] = _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_unpacklo_epi8(a, b); + // + // return VALIDATE_INT8_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_unpacklo_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const 
double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d ret = _mm_unpacklo_pd(a, b); + // + // return validate_double(ret, _a[0], _b[0]); + return TEST_UNIMPL; +} + +result_t test_mm_xor_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_float_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_float_pointer2; + // + // int64_t d0 = _a[0] ^ _b[0]; + // int64_t d1 = _a[1] ^ _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_xor_pd(a, b); + // + // return validate_double(c, *((double *)&d0), *((double *)&d1)); + return TEST_UNIMPL; +} + +result_t test_mm_xor_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t d0 = _a[0] ^ _b[0]; + // int64_t d1 = _a[1] ^ _b[1]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_xor_si128(a, b); + // + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +/* SSE3 */ +result_t test_mm_addsub_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double d0 = _a[0] - _b[0]; + // double d1 = _a[1] + _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_addsub_pd(a, b); + // + // return validate_double(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_addsub_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _a[0] - _b[0]; + // float f1 = _a[1] + _b[1]; + // float f2 = _a[2] - _b[2]; + // float f3 = _a[3] + _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_addsub_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double f0 = _a[0] + _a[1]; + // double f1 = _b[0] + _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_hadd_pd(a, b); + // + // return validate_double(c, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _a[0] + _a[1]; + // float f1 = _a[2] + _a[3]; + // float f2 = _b[0] + _b[1]; + // float f3 = _b[2] + _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_hadd_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double f0 = _a[0] 
- _a[1]; + // double f1 = _b[0] - _b[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d c = _mm_hsub_pd(a, b); + // + // return validate_double(c, f0, f1); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // float f0 = _a[0] - _a[1]; + // float f1 = _a[2] - _a[3]; + // float f2 = _b[0] - _b[1]; + // float f3 = _b[2] - _b[3]; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_hsub_ps(a, b); + // + // return validate_float(c, f0, f1, f2, f3); + return TEST_UNIMPL; +} + +result_t test_mm_lddqu_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_loadu_si128(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_loaddup_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *addr = (const double *)impl.test_cases_float_pointer1; + // + // __m128d ret = _mm_loaddup_pd(addr); + // + // return validate_double(ret, addr[0], addr[0]); + return TEST_UNIMPL; +} + +result_t test_mm_movedup_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *p = (const double *)impl.test_cases_float_pointer1; + // __m128d a = load_m128d(p); + // __m128d b = _mm_movedup_pd(a); + // + // return validate_double(b, p[0], p[0]); + return TEST_UNIMPL; +} + +result_t test_mm_movehdup_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // __m128 a = load_m128(p); + // return validate_float(_mm_movehdup_ps(a), p[1], p[1], p[3], p[3]); + return TEST_UNIMPL; +} + +result_t test_mm_moveldup_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *p = impl.test_cases_float_pointer1; + // __m128 a = load_m128(p); + // return validate_float(_mm_moveldup_ps(a), p[0], p[0], p[2], p[2]); + return TEST_UNIMPL; +} + +/* SSSE3 */ +result_t test_mm_abs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // __m128i c = _mm_abs_epi16(a); + // + // uint32_t d[8]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // d[2] = (_a[2] < 0) ? -_a[2] : _a[2]; + // d[3] = (_a[3] < 0) ? -_a[3] : _a[3]; + // d[4] = (_a[4] < 0) ? -_a[4] : _a[4]; + // d[5] = (_a[5] < 0) ? -_a[5] : _a[5]; + // d[6] = (_a[6] < 0) ? -_a[6] : _a[6]; + // d[7] = (_a[7] < 0) ? -_a[7] : _a[7]; + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // __m128i c = _mm_abs_epi32(a); + // + // uint32_t d[4]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // d[2] = (_a[2] < 0) ? -_a[2] : _a[2]; + // d[3] = (_a[3] < 0) ? -_a[3] : _a[3]; + // + // return VALIDATE_UINT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // __m128i c = _mm_abs_epi8(a); + // + // uint32_t d[16]; + // for (int i = 0; i < 16; i++) { + // d[i] = (_a[i] < 0) ? 
-_a[i] : _a[i]; + // } + // + // return VALIDATE_UINT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // __m64 c = _mm_abs_pi16(a); + // + // uint32_t d[4]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // d[2] = (_a[2] < 0) ? -_a[2] : _a[2]; + // d[3] = (_a[3] < 0) ? -_a[3] : _a[3]; + // + // return VALIDATE_UINT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // __m64 c = _mm_abs_pi32(a); + // + // uint32_t d[2]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // + // return VALIDATE_UINT32_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_abs_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // __m64 a = load_m64(_a); + // __m64 c = _mm_abs_pi8(a); + // + // uint32_t d[8]; + // d[0] = (_a[0] < 0) ? -_a[0] : _a[0]; + // d[1] = (_a[1] < 0) ? -_a[1] : _a[1]; + // d[2] = (_a[2] < 0) ? -_a[2] : _a[2]; + // d[3] = (_a[3] < 0) ? -_a[3] : _a[3]; + // d[4] = (_a[4] < 0) ? -_a[4] : _a[4]; + // d[5] = (_a[5] < 0) ? -_a[5] : _a[5]; + // d[6] = (_a[6] < 0) ? -_a[6] : _a[6]; + // d[7] = (_a[7] < 0) ? -_a[7] : _a[7]; + // + // return VALIDATE_UINT8_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_alignr_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // #if defined(__clang__) + // return TEST_UNIMPL; + // #else + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // unsigned int shift = (iter % 5) << 3; + // uint8_t d[32]; + // + // if (shift >= 32) { + // memset((void *)d, 0, sizeof(d)); + // } else { + // memcpy((void *)d, (const void *)_b, 16); + // memcpy((void *)(d + 16), (const void *)_a, 16); + // // shifting + // for (size_t x = 0; x < sizeof(d); x++) { + // if (x + shift >= sizeof(d)) + // d[x] = 0; + // else + // d[x] = d[x + shift]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret; + // switch (iter % 5) { + // case 0: + // ret = _mm_alignr_epi8(a, b, 0); + // break; + // case 1: + // ret = _mm_alignr_epi8(a, b, 8); + // break; + // case 2: + // ret = _mm_alignr_epi8(a, b, 16); + // break; + // case 3: + // ret = _mm_alignr_epi8(a, b, 24); + // break; + // case 4: + // ret = _mm_alignr_epi8(a, b, 32); + // break; + // } + // + // return VALIDATE_UINT8_M128(ret, d); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_alignr_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // #if defined(__clang__) + // return TEST_UNIMPL; + // #else + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // unsigned int shift = (iter % 3) << 3; + // uint8_t d[16]; + // + // if (shift >= 16) { + // memset((void *)d, 0, sizeof(d)); + // } else { + // memcpy((void *)d, (const void *)_b, 8); + // memcpy((void *)(d + 8), (const void *)_a, 8); + // // shifting + // for (size_t x = 0; x < sizeof(d); x++) { + // if (x + shift >= sizeof(d)) + // d[x] = 0; + // else + // d[x] = d[x + shift]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // 
__m64 ret; + // switch (iter % 3) { + // case 0: + // ret = _mm_alignr_pi8(a, b, 0); + // break; + // case 1: + // ret = _mm_alignr_pi8(a, b, 8); + // break; + // case 2: + // ret = _mm_alignr_pi8(a, b, 16); + // break; + // } + // + // return VALIDATE_UINT8_M64(ret, d); + // #endif + return TEST_UNIMPL; +} + +result_t test_mm_hadd_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[8]; + // d[0] = _a[0] + _a[1]; + // d[1] = _a[2] + _a[3]; + // d[2] = _a[4] + _a[5]; + // d[3] = _a[6] + _a[7]; + // d[4] = _b[0] + _b[1]; + // d[5] = _b[2] + _b[3]; + // d[6] = _b[4] + _b[5]; + // d[7] = _b[6] + _b[7]; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_hadd_epi16(a, b); + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // int32_t d[4]; + // d[0] = _a[0] + _a[1]; + // d[1] = _a[2] + _a[3]; + // d[2] = _b[0] + _b[1]; + // d[3] = _b[2] + _b[3]; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_hadd_epi32(a, b); + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t d[4]; + // d[0] = _a[0] + _a[1]; + // d[1] = _a[2] + _a[3]; + // d[2] = _b[0] + _b[1]; + // d[3] = _b[2] + _b[3]; + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_hadd_pi16(a, b); + // return VALIDATE_INT16_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_hadd_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // int32_t d[2]; + // d[0] = _a[0] + _a[1]; + // d[1] = _b[0] + _b[1]; + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_hadd_pi32(a, b); + // return VALIDATE_INT32_M64(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_hadds_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int16_t d16[8]; + // int32_t d32[8]; + // d32[0] = (int32_t)_a[0] + (int32_t)_a[1]; + // d32[1] = (int32_t)_a[2] + (int32_t)_a[3]; + // d32[2] = (int32_t)_a[4] + (int32_t)_a[5]; + // d32[3] = (int32_t)_a[6] + (int32_t)_a[7]; + // d32[4] = (int32_t)_b[0] + (int32_t)_b[1]; + // d32[5] = (int32_t)_b[2] + (int32_t)_b[3]; + // d32[6] = (int32_t)_b[4] + (int32_t)_b[5]; + // d32[7] = (int32_t)_b[6] + (int32_t)_b[7]; + // for (int i = 0; i < 8; i++) { + // if (d32[i] > (int32_t)INT16_MAX) + // d16[i] = INT16_MAX; + // else if (d32[i] < (int32_t)INT16_MIN) + // d16[i] = INT16_MIN; + // else + // d16[i] = (int16_t)d32[i]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_hadds_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d16); + return TEST_UNIMPL; +} + +result_t test_mm_hadds_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + 
// const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int16_t d16[8]; + // int32_t d32[8]; + // d32[0] = (int32_t)_a[0] + (int32_t)_a[1]; + // d32[1] = (int32_t)_a[2] + (int32_t)_a[3]; + // d32[2] = (int32_t)_b[0] + (int32_t)_b[1]; + // d32[3] = (int32_t)_b[2] + (int32_t)_b[3]; + // for (int i = 0; i < 8; i++) { + // if (d32[i] > (int32_t)INT16_MAX) + // d16[i] = INT16_MAX; + // else if (d32[i] < (int32_t)INT16_MIN) + // d16[i] = INT16_MIN; + // else + // d16[i] = (int16_t)d32[i]; + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_hadds_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, d16); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int16_t d[8]; + // d[0] = _a[0] - _a[1]; + // d[1] = _a[2] - _a[3]; + // d[2] = _a[4] - _a[5]; + // d[3] = _a[6] - _a[7]; + // d[4] = _b[0] - _b[1]; + // d[5] = _b[2] - _b[3]; + // d[6] = _b[4] - _b[5]; + // d[7] = _b[6] - _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_hsub_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = _a[0] - _a[1]; + // d[1] = _a[2] - _a[3]; + // d[2] = _b[0] - _b[1]; + // d[3] = _b[2] - _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_hsub_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[4]; + // d[0] = _a[0] - _a[1]; + // d[1] = _a[2] - _a[3]; + // d[2] = _b[0] - _b[1]; + // d[3] = _b[2] - _b[3]; + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_hsub_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_hsub_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // + // int32_t d[2]; + // d[0] = _a[0] - _a[1]; + // d[1] = _b[0] - _b[1]; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_hsub_pi32(a, b); + // + // return VALIDATE_INT32_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_hsubs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int16_t d16[8]; + // int32_t d32[8]; + // d32[0] = (int32_t)_a[0] - (int32_t)_a[1]; + // d32[1] = (int32_t)_a[2] - (int32_t)_a[3]; + // d32[2] = (int32_t)_a[4] - (int32_t)_a[5]; + // d32[3] = (int32_t)_a[6] - (int32_t)_a[7]; + // d32[4] = (int32_t)_b[0] - (int32_t)_b[1]; + // d32[5] = (int32_t)_b[2] - (int32_t)_b[3]; + // d32[6] = (int32_t)_b[4] - (int32_t)_b[5]; + // d32[7] = (int32_t)_b[6] - (int32_t)_b[7]; + // for (int i = 0; i < 8; i++) { + // if (d32[i] > 
(int32_t)INT16_MAX) + // d16[i] = INT16_MAX; + // else if (d32[i] < (int32_t)INT16_MIN) + // d16[i] = INT16_MIN; + // else + // d16[i] = (int16_t)d32[i]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_hsubs_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d16); + return TEST_UNIMPL; +} + +result_t test_mm_hsubs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer1; + // + // int32_t _d[4]; + // _d[0] = (int32_t)_a[0] - (int32_t)_a[1]; + // _d[1] = (int32_t)_a[2] - (int32_t)_a[3]; + // _d[2] = (int32_t)_b[0] - (int32_t)_b[1]; + // _d[3] = (int32_t)_b[2] - (int32_t)_b[3]; + // + // for (int i = 0; i < 4; i++) { + // if (_d[i] > (int32_t)INT16_MAX) { + // _d[i] = INT16_MAX; + // } else if (_d[i] < (int32_t)INT16_MIN) { + // _d[i] = INT16_MIN; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_hsubs_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, _d); + return TEST_UNIMPL; +} + +result_t test_mm_maddubs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int32_t d0 = (int32_t)(_a[0] * _b[0]); + // int32_t d1 = (int32_t)(_a[1] * _b[1]); + // int32_t d2 = (int32_t)(_a[2] * _b[2]); + // int32_t d3 = (int32_t)(_a[3] * _b[3]); + // int32_t d4 = (int32_t)(_a[4] * _b[4]); + // int32_t d5 = (int32_t)(_a[5] * _b[5]); + // int32_t d6 = (int32_t)(_a[6] * _b[6]); + // int32_t d7 = (int32_t)(_a[7] * _b[7]); + // int32_t d8 = (int32_t)(_a[8] * _b[8]); + // int32_t d9 = (int32_t)(_a[9] * _b[9]); + // int32_t d10 = (int32_t)(_a[10] * _b[10]); + // int32_t d11 = (int32_t)(_a[11] * _b[11]); + // int32_t d12 = (int32_t)(_a[12] * _b[12]); + // int32_t d13 = (int32_t)(_a[13] * _b[13]); + // int32_t d14 = (int32_t)(_a[14] * _b[14]); + // int32_t d15 = (int32_t)(_a[15] * _b[15]); + // + // int16_t e[8]; + // e[0] = saturate_16(d0 + d1); + // e[1] = saturate_16(d2 + d3); + // e[2] = saturate_16(d4 + d5); + // e[3] = saturate_16(d6 + d7); + // e[4] = saturate_16(d8 + d9); + // e[5] = saturate_16(d10 + d11); + // e[6] = saturate_16(d12 + d13); + // e[7] = saturate_16(d14 + d15); + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_maddubs_epi16(a, b); + // return VALIDATE_INT16_M128(c, e); + return TEST_UNIMPL; +} + +result_t test_mm_maddubs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int16_t d0 = (int16_t)(_a[0] * _b[0]); + // int16_t d1 = (int16_t)(_a[1] * _b[1]); + // int16_t d2 = (int16_t)(_a[2] * _b[2]); + // int16_t d3 = (int16_t)(_a[3] * _b[3]); + // int16_t d4 = (int16_t)(_a[4] * _b[4]); + // int16_t d5 = (int16_t)(_a[5] * _b[5]); + // int16_t d6 = (int16_t)(_a[6] * _b[6]); + // int16_t d7 = (int16_t)(_a[7] * _b[7]); + // + // int16_t e[4]; + // e[0] = saturate_16(d0 + d1); + // e[1] = saturate_16(d2 + d3); + // e[2] = saturate_16(d4 + d5); + // e[3] = saturate_16(d6 + d7); + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_maddubs_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, e); + return TEST_UNIMPL; +} + +result_t test_mm_mulhrs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const 
int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // int32_t _c[8]; + // for (int i = 0; i < 8; i++) { + // _c[i] = (((((int32_t)_a[i] * (int32_t)_b[i]) >> 14) + 1) & 0x1FFFE) >> + // 1; + // } + // __m128i c = _mm_mulhrs_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, _c); + return TEST_UNIMPL; +} + +result_t test_mm_mulhrs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // int32_t _c[4]; + // for (int i = 0; i < 4; i++) { + // _c[i] = (((((int32_t)_a[i] * (int32_t)_b[i]) >> 14) + 1) & 0x1FFFE) >> + // 1; + // } + // __m64 c = _mm_mulhrs_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, _c); + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t dst[16]; + // + // for (int i = 0; i < 16; i++) { + // if (_b[i] & 0x80) { + // dst[i] = 0; + // } else { + // dst[i] = _a[_b[i] & 0x0F]; + // } + // } + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i ret = _mm_shuffle_epi8(a, b); + // + // return VALIDATE_INT8_M128(ret, dst); + return TEST_UNIMPL; +} + +result_t test_mm_shuffle_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t dst[8]; + // + // for (int i = 0; i < 8; i++) { + // if (_b[i] & 0x80) { + // dst[i] = 0; + // } else { + // dst[i] = _a[_b[i] & 0x07]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 ret = _mm_shuffle_pi8(a, b); + // + // return VALIDATE_INT8_M64(ret, dst); + return TEST_UNIMPL; +} + +result_t test_mm_sign_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[8]; + // for (int i = 0; i < 8; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sign_epi16(a, b); + // + // return VALIDATE_INT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // for (int i = 0; i < 4; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sign_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; 
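+ // Reference for _mm_sign_epi8: negate _a[i] when _b[i] is negative, zero it when _b[i] is zero, and pass it through unchanged otherwise (same pattern as the other _mm_sign_* tests above).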
+ // for (int i = 0; i < 16; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_sign_epi8(a, b); + // + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // + // int16_t d[4]; + // for (int i = 0; i < 4; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sign_pi16(a, b); + // + // return VALIDATE_INT16_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_pi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[2]; + // for (int i = 0; i < 2; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sign_pi32(a, b); + // + // return VALIDATE_INT32_M64(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_sign_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[8]; + // for (int i = 0; i < 8; i++) { + // if (_b[i] < 0) { + // d[i] = -_a[i]; + // } else if (_b[i] == 0) { + // d[i] = 0; + // } else { + // d[i] = _a[i]; + // } + // } + // + // __m64 a = load_m64(_a); + // __m64 b = load_m64(_b); + // __m64 c = _mm_sign_pi8(a, b); + // + // return VALIDATE_INT8_M64(c, d); + return TEST_UNIMPL; +} + +/* SSE4.1 */ +result_t test_mm_blend_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2; + // int16_t _c[8]; + // __m128i a, b, c; + // + // #define TEST_IMPL(IDX) + // for (int j = 0; j < 8; j++) { + // if ((IDX >> j) & 0x1) { + // _c[j] = _b[j]; + // } else { + // _c[j] = _a[j]; + // } + // } + // a = load_m128i(_a); + // b = load_m128i(_b); + // c = _mm_blend_epi16(a, b, IDX); + // CHECK_RESULT(VALIDATE_INT16_M128(c, _c)); + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_blend_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // __m128d a, b, c; + // + // #define TEST_IMPL(IDX) + // double _c##IDX[2]; + // for (int j = 0; j < 2; j++) { + // if ((IDX >> j) & 0x1) { + // _c##IDX[j] = _b[j]; + // } else { + // _c##IDX[j] = _a[j]; + // } + // } + // + // a = load_m128d(_a); + // b = load_m128d(_b); + // c = _mm_blend_pd(a, b, IDX); + // CHECK_RESULT(validate_double(c, _c##IDX[0], _c##IDX[1])) + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_blend_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = 
impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c; + // + // gcc and clang can't compile call to _mm_blend_ps with 3rd argument as + // integer type due 4 bit size limitation. + // #define TEST_IMPL(IDX) + // float _c##IDX[4]; + // for (int i = 0; i < 4; i++) { + // if (IDX & (1 << i)) { + // _c##IDX[i] = _b[i]; + // } else { + // _c##IDX[i] = _a[i]; + // } + // } + // + // c = _mm_blend_ps(a, b, IDX); + // CHECK_RESULT( + // validate_float(c, _c##IDX[0], _c##IDX[1], _c##IDX[2], _c##IDX[3])) + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_blendv_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // const int8_t _mask[16] = {(const int8_t)impl.test_cases_ints[iter], + // (const int8_t)impl.test_cases_ints[iter + 1], + // (const int8_t)impl.test_cases_ints[iter + 2], + // (const int8_t)impl.test_cases_ints[iter + 3], + // (const int8_t)impl.test_cases_ints[iter + 4], + // (const int8_t)impl.test_cases_ints[iter + 5], + // (const int8_t)impl.test_cases_ints[iter + 6], + // (const int8_t)impl.test_cases_ints[iter + 7]}; + // + // int8_t _c[16]; + // for (int i = 0; i < 16; i++) { + // if (_mask[i] >> 7) { + // _c[i] = _b[i]; + // } else { + // _c[i] = _a[i]; + // } + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i mask = load_m128i(_mask); + // __m128i c = _mm_blendv_epi8(a, b, mask); + // + // return VALIDATE_INT8_M128(c, _c); + return TEST_UNIMPL; +} + +result_t test_mm_blendv_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // const double _mask[] = {(double)impl.test_cases_floats[iter], + // (double)impl.test_cases_floats[iter + 1]}; + // + // double _c[2]; + // for (int i = 0; i < 2; i++) { + // // signed shift right would return a result which is either all 1's + // from + // // negative numbers or all 0's from positive numbers + // if ((*(const int64_t *)(_mask + i)) >> 63) { + // _c[i] = _b[i]; + // } else { + // _c[i] = _a[i]; + // } + // } + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d mask = load_m128d(_mask); + // + // __m128d c = _mm_blendv_pd(a, b, mask); + // + // return validate_double(c, _c[0], _c[1]); + return TEST_UNIMPL; +} + +result_t test_mm_blendv_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // const float _mask[] = { + // impl.test_cases_floats[iter], impl.test_cases_floats[iter + 1], + // impl.test_cases_floats[iter + 2], impl.test_cases_floats[iter + 3]}; + // + // float _c[4]; + // for (int i = 0; i < 4; i++) { + // // signed shift right would return a result which is either all 1's + // from + // // negative numbers or all 0's from positive numbers + // if ((*(const int32_t *)(_mask + i)) >> 31) { + // _c[i] = _b[i]; + // } else { + // _c[i] = _a[i]; + // } + // } + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 mask = load_m128(_mask); + // + // __m128 c = _mm_blendv_ps(a, b, mask); + // + // return validate_float(c, _c[0], _c[1], _c[2], _c[3]); + return TEST_UNIMPL; +} + +result_t 
test_mm_ceil_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double dx = ceil(_a[0]); + // double dy = ceil(_a[1]); + // + // __m128d a = load_m128d(_a); + // __m128d ret = _mm_ceil_pd(a); + // + // return validate_double(ret, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_ceil_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // float dx = ceilf(_a[0]); + // float dy = ceilf(_a[1]); + // float dz = ceilf(_a[2]); + // float dw = ceilf(_a[3]); + // + // __m128 a = _mm_load_ps(_a); + // __m128 c = _mm_ceil_ps(a); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_ceil_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double dx = ceil(_b[0]); + // double dy = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d ret = _mm_ceil_sd(a, b); + // + // return validate_double(ret, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_ceil_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer1; + // + // float f0 = ceilf(_b[0]); + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_ceil_ss(a, b); + // + // return validate_float(c, f0, _a[1], _a[2], _a[3]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpeq_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // int64_t d0 = (_a[0] == _b[0]) ? 0xffffffffffffffff : 0x0; + // int64_t d1 = (_a[1] == _b[1]) ? 
0xffffffffffffffff : 0x0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_cmpeq_epi64(a, b); + // return validate_int64(c, d0, d1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi16_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // d[2] = (int32_t)_a[2]; + // d[3] = (int32_t)_a[3]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi16_epi32(a); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi16_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi16_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi32_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi32_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi8_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // int16_t d[8]; + // d[0] = (int16_t)_a[0]; + // d[1] = (int16_t)_a[1]; + // d[2] = (int16_t)_a[2]; + // d[3] = (int16_t)_a[3]; + // d[4] = (int16_t)_a[4]; + // d[5] = (int16_t)_a[5]; + // d[6] = (int16_t)_a[6]; + // d[7] = (int16_t)_a[7]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi8_epi16(a); + // + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi8_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // d[2] = (int32_t)_a[2]; + // d[3] = (int32_t)_a[3]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi8_epi32(a); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepi8_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepi8_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu16_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // d[2] = (int32_t)_a[2]; + // d[3] = (int32_t)_a[3]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu16_epi32(a); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu16_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu16_epi64(a); + // + // return validate_int64(ret, 
i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu32_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu32_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu8_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // + // int16_t d[8]; + // d[0] = (int16_t)_a[0]; + // d[1] = (int16_t)_a[1]; + // d[2] = (int16_t)_a[2]; + // d[3] = (int16_t)_a[3]; + // d[4] = (int16_t)_a[4]; + // d[5] = (int16_t)_a[5]; + // d[6] = (int16_t)_a[6]; + // d[7] = (int16_t)_a[7]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu8_epi16(a); + // + // return VALIDATE_INT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu8_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // + // int32_t d[4]; + // d[0] = (int32_t)_a[0]; + // d[1] = (int32_t)_a[1]; + // d[2] = (int32_t)_a[2]; + // d[3] = (int32_t)_a[3]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu8_epi32(a); + // + // return VALIDATE_INT32_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_cvtepu8_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // + // int64_t i0 = (int64_t)_a[0]; + // int64_t i1 = (int64_t)_a[1]; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_cvtepu8_epi64(a); + // + // return validate_int64(ret, i0, i1); + return TEST_UNIMPL; +} + +#define MM_DP_PD_TEST_CASE_WITH(imm8) \ + do { \ + const double *_a = (const double *)impl.test_cases_float_pointer1; \ + const double *_b = (const double *)impl.test_cases_float_pointer2; \ + const int imm = imm8; \ + double d[2]; \ + double sum = 0; \ + for (size_t i = 0; i < 2; i++) \ + sum += ((imm) & (1 << (i + 4))) ? _a[i] * _b[i] : 0; \ + for (size_t i = 0; i < 2; i++) \ + d[i] = (imm & (1 << i)) ? 
sum : 0; \ + __m128d a = load_m128d(_a); \ + __m128d b = load_m128d(_b); \ + __m128d ret = _mm_dp_pd(a, b, imm); \ + if (validate_double(ret, d[0], d[1]) != TEST_SUCCESS) \ + return TEST_FAIL; \ + } while (0) + +#define GENERATE_MM_DP_PD_TEST_CASES \ + MM_DP_PD_TEST_CASE_WITH(0xF0); \ + MM_DP_PD_TEST_CASE_WITH(0xF1); \ + MM_DP_PD_TEST_CASE_WITH(0xF2); \ + MM_DP_PD_TEST_CASE_WITH(0xFF); \ + MM_DP_PD_TEST_CASE_WITH(0x10); \ + MM_DP_PD_TEST_CASE_WITH(0x11); \ + MM_DP_PD_TEST_CASE_WITH(0x12); \ + MM_DP_PD_TEST_CASE_WITH(0x13); \ + MM_DP_PD_TEST_CASE_WITH(0x00); \ + MM_DP_PD_TEST_CASE_WITH(0x01); \ + MM_DP_PD_TEST_CASE_WITH(0x02); \ + MM_DP_PD_TEST_CASE_WITH(0x03); \ + MM_DP_PD_TEST_CASE_WITH(0x20); \ + MM_DP_PD_TEST_CASE_WITH(0x21); \ + MM_DP_PD_TEST_CASE_WITH(0x22); \ + MM_DP_PD_TEST_CASE_WITH(0x23); + +result_t test_mm_dp_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_DP_PD_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +#define MM_DP_PS_TEST_CASE_WITH(IMM) \ + do { \ + const float *_a = impl.test_cases_float_pointer1; \ + const float *_b = impl.test_cases_float_pointer2; \ + const int imm = IMM; \ + __m128 a = load_m128(_a); \ + __m128 b = load_m128(_b); \ + __m128 out = _mm_dp_ps(a, b, imm); \ + float r[4]; /* the reference */ \ + float sum = 0; \ + for (size_t i = 0; i < 4; i++) \ + sum += ((imm) & (1 << (i + 4))) ? _a[i] * _b[i] : 0; \ + for (size_t i = 0; i < 4; i++) \ + r[i] = (imm & (1 << i)) ? sum : 0; \ + /* the epsilon has to be large enough, otherwise test suite fails. */ \ + if (validate_float_epsilon(out, r[0], r[1], r[2], r[3], 2050.0f) != \ + TEST_SUCCESS) \ + return TEST_FAIL; \ + } while (0) + +#define GENERATE_MM_DP_PS_TEST_CASES \ + MM_DP_PS_TEST_CASE_WITH(0xFF); \ + MM_DP_PS_TEST_CASE_WITH(0x7F); \ + MM_DP_PS_TEST_CASE_WITH(0x9F); \ + MM_DP_PS_TEST_CASE_WITH(0x2F); \ + MM_DP_PS_TEST_CASE_WITH(0x0F); \ + MM_DP_PS_TEST_CASE_WITH(0x23); \ + MM_DP_PS_TEST_CASE_WITH(0xB5); + +result_t test_mm_dp_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_DP_PS_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_extract_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int32_t *_a = (int32_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // int c; + // + // #define TEST_IMPL(IDX) + // c = _mm_extract_epi32(a, IDX); + // ASSERT_RETURN(c == *(_a + IDX)); + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_extract_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // int64_t *_a = (int64_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // __int64 c; + // + // #define TEST_IMPL(IDX) + // c = _mm_extract_epi64(a, IDX); + // ASSERT_RETURN(c == *(_a + IDX)); + // + // IMM_2_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_extract_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint8_t *_a = (uint8_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // int c; + // + // #define TEST_IMPL(IDX) + // c = _mm_extract_epi8(a, IDX); + // ASSERT_RETURN(c == *(_a + IDX)); + // + // IMM_8_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_extract_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = (const float *)impl.test_cases_float_pointer1; + // + // __m128 a = _mm_load_ps(_a); + // int32_t c; + // + // #define TEST_IMPL(IDX) + // c = _mm_extract_ps(a, IDX); + 
// ASSERT_RETURN(c == *(const int32_t *)(_a + IDX)); + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_floor_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // + // double dx = floor(_a[0]); + // double dy = floor(_a[1]); + // + // __m128d a = load_m128d(_a); + // __m128d ret = _mm_floor_pd(a); + // + // return validate_double(ret, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_floor_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // float dx = floorf(_a[0]); + // float dy = floorf(_a[1]); + // float dz = floorf(_a[2]); + // float dw = floorf(_a[3]); + // + // __m128 a = load_m128(_a); + // __m128 c = _mm_floor_ps(a); + // return validate_float(c, dx, dy, dz, dw); + return TEST_UNIMPL; +} + +result_t test_mm_floor_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (const double *)impl.test_cases_float_pointer1; + // const double *_b = (const double *)impl.test_cases_float_pointer2; + // + // double dx = floor(_b[0]); + // double dy = _a[1]; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // __m128d ret = _mm_floor_sd(a, b); + // + // return validate_double(ret, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_floor_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer1; + // + // float f0 = floorf(_b[0]); + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // __m128 c = _mm_floor_ss(a, b); + // + // return validate_float(c, f0, _a[1], _a[2], _a[3]); + return TEST_UNIMPL; +} + +result_t test_mm_insert_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t insert = (int32_t)*impl.test_cases_int_pointer2; + // __m128i a, b; + // + // #define TEST_IMPL(IDX) + // int32_t d##IDX[4]; + // for (int i = 0; i < 4; i++) { + // d##IDX[i] = _a[i]; + // } + // d##IDX[IDX] = insert; + // + // a = load_m128i(_a); + // b = _mm_insert_epi32(a, (int)insert, IDX); + // CHECK_RESULT(VALIDATE_INT32_M128(b, d##IDX)); + // + // IMM_4_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_insert_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // int64_t insert = (int64_t)*impl.test_cases_int_pointer2; + // + // __m128i a, b; + // int64_t d[2]; + // #define TEST_IMPL(IDX) + // d[0] = _a[0]; + // d[1] = _a[1]; + // d[IDX] = insert; + // a = load_m128i(_a); + // b = _mm_insert_epi64(a, insert, IDX); + // CHECK_RESULT(validate_int64(b, d[0], d[1])); + // + // IMM_2_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_insert_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t insert = (int8_t)*impl.test_cases_int_pointer2; + // __m128i a, b; + // int8_t d[16]; + // + // #define TEST_IMPL(IDX) + // for (int i = 0; i < 16; i++) { + // d[i] = _a[i]; + // } + // d[IDX] = insert; + // a = load_m128i(_a); + // b = _mm_insert_epi8(a, insert, IDX); + // CHECK_RESULT(VALIDATE_INT8_M128(b, d)); + // + // IMM_16_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t 
test_mm_insert_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // + // __m128 a, b, c; + // #define TEST_IMPL(IDX) + // float d##IDX[4] = {_a[0], _a[1], _a[2], _a[3]}; + // d##IDX[(IDX >> 4) & 0x3] = _b[(IDX >> 6) & 0x3]; + // + // for (int j = 0; j < 4; j++) { + // if (IDX & (1 << j)) { + // d##IDX[j] = 0; + // } + // } + // + // a = _mm_load_ps(_a); + // b = _mm_load_ps(_b); + // c = _mm_insert_ps(a, b, IDX); + // CHECK_RESULT(validate_float(c, d##IDX[0], d##IDX[1], d##IDX[2], + // d##IDX[3])); + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_max_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_max_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // int8_t d[16]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] > _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] > _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] > _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] > _b[7] ? _a[7] : _b[7]; + // d[8] = _a[8] > _b[8] ? _a[8] : _b[8]; + // d[9] = _a[9] > _b[9] ? _a[9] : _b[9]; + // d[10] = _a[10] > _b[10] ? _a[10] : _b[10]; + // d[11] = _a[11] > _b[11] ? _a[11] : _b[11]; + // d[12] = _a[12] > _b[12] ? _a[12] : _b[12]; + // d[13] = _a[13] > _b[13] ? _a[13] : _b[13]; + // d[14] = _a[14] > _b[14] ? _a[14] : _b[14]; + // d[15] = _a[15] > _b[15] ? _a[15] : _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_max_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // + // uint16_t d[8]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] > _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] > _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] > _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] > _b[7] ? _a[7] : _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_max_epu16(a, b); + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_max_epu32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2; + // + // uint32_t d[4]; + // d[0] = _a[0] > _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] > _b[1] ? 
_a[1] : _b[1]; + // d[2] = _a[2] > _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] > _b[3] ? _a[3] : _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_max_epu32(a, b); + // + // return VALIDATE_UINT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int32_t d[4]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epi32(a, b); + // + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1; + // const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2; + // + // int8_t d[16]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] < _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] < _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] < _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] < _b[7] ? _a[7] : _b[7]; + // d[8] = _a[8] < _b[8] ? _a[8] : _b[8]; + // d[9] = _a[9] < _b[9] ? _a[9] : _b[9]; + // d[10] = _a[10] < _b[10] ? _a[10] : _b[10]; + // d[11] = _a[11] < _b[11] ? _a[11] : _b[11]; + // d[12] = _a[12] < _b[12] ? _a[12] : _b[12]; + // d[13] = _a[13] < _b[13] ? _a[13] : _b[13]; + // d[14] = _a[14] < _b[14] ? _a[14] : _b[14]; + // d[15] = _a[15] < _b[15] ? _a[15] : _b[15]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // + // __m128i c = _mm_min_epi8(a, b); + // return VALIDATE_INT8_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1; + // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2; + // + // uint16_t d[8]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? _a[3] : _b[3]; + // d[4] = _a[4] < _b[4] ? _a[4] : _b[4]; + // d[5] = _a[5] < _b[5] ? _a[5] : _b[5]; + // d[6] = _a[6] < _b[6] ? _a[6] : _b[6]; + // d[7] = _a[7] < _b[7] ? _a[7] : _b[7]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epu16(a, b); + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_min_epu32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1; + // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2; + // + // uint32_t d[4]; + // d[0] = _a[0] < _b[0] ? _a[0] : _b[0]; + // d[1] = _a[1] < _b[1] ? _a[1] : _b[1]; + // d[2] = _a[2] < _b[2] ? _a[2] : _b[2]; + // d[3] = _a[3] < _b[3] ? 
_a[3] : _b[3]; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_min_epu32(a, b); + // + // return VALIDATE_UINT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_minpos_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1; + // uint16_t index = 0, min = (uint16_t)_a[0]; + // for (int i = 0; i < 8; i++) { + // if ((uint16_t)_a[i] < min) { + // index = (uint16_t)i; + // min = (uint16_t)_a[i]; + // } + // } + // + // uint16_t d[8] = {min, index, 0, 0, 0, 0, 0, 0}; + // + // __m128i a = load_m128i(_a); + // __m128i ret = _mm_minpos_epu16(a); + // return VALIDATE_UINT16_M128(ret, d); + return TEST_UNIMPL; +} + +result_t test_mm_mpsadbw_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c; + // #define TEST_IMPL(IDX) + // uint8_t a_offset##IDX = ((IDX >> 2) & 0x1) * 4; + // uint8_t b_offset##IDX = (IDX & 0x3) * 4; + // + // uint16_t d##IDX[8] = {}; + // for (int i = 0; i < 8; i++) { + // for (int j = 0; j < 4; j++) { + // d##IDX[i] += abs(_a[(a_offset##IDX + i) + j] - _b[b_offset##IDX + + // j]); + // } + // } + // c = _mm_mpsadbw_epu8(a, b, IDX); + // CHECK_RESULT(VALIDATE_UINT16_M128(c,d##IDX)); + // + // IMM_8_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_mul_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // int64_t dx = (int64_t)(_a[0]) * (int64_t)(_b[0]); + // int64_t dy = (int64_t)(_a[2]) * (int64_t)(_b[2]); + // + // __m128i a = _mm_loadu_si128((const __m128i *)_a); + // __m128i b = _mm_loadu_si128((const __m128i *)_b); + // __m128i r = _mm_mul_epi32(a, b); + // + // return validate_int64(r, dx, dy); + return TEST_UNIMPL; +} + +result_t test_mm_mullo_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // int32_t d[4]; + // + // for (int i = 0; i < 4; i++) { + // d[i] = (int32_t)((int64_t)_a[i] * (int64_t)_b[i]); + // } + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_mullo_epi32(a, b); + // return VALIDATE_INT32_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_packus_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint16_t max = UINT16_MAX; + // uint16_t min = 0; + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2; + // + // uint16_t d[8]; + // for (int i = 0; i < 4; i++) { + // if (_a[i] > (int32_t)max) + // d[i] = max; + // else if (_a[i] < (int32_t)min) + // d[i] = min; + // else + // d[i] = (uint16_t)_a[i]; + // } + // for (int i = 0; i < 4; i++) { + // if (_b[i] > (int32_t)max) + // d[i + 4] = max; + // else if (_b[i] < (int32_t)min) + // d[i + 4] = min; + // else + // d[i + 4] = (uint16_t)_b[i]; + // } + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i c = _mm_packus_epi32(a, b); + // + // return VALIDATE_UINT16_M128(c, d); + return TEST_UNIMPL; +} + +result_t test_mm_round_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const 
double *_a = (double *)impl.test_cases_float_pointer1; + // double d[2]; + // __m128d ret; + // + // __m128d a = load_m128d(_a); + // switch (iter & 0x7) { + // case 0: + // d[0] = bankersRounding(_a[0]); + // d[1] = bankersRounding(_a[1]); + // + // ret = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + // break; + // case 1: + // d[0] = floor(_a[0]); + // d[1] = floor(_a[1]); + // + // ret = _mm_round_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // break; + // case 2: + // d[0] = ceil(_a[0]); + // d[1] = ceil(_a[1]); + // + // ret = _mm_round_pd(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // break; + // case 3: + // d[0] = _a[0] > 0 ? floor(_a[0]) : ceil(_a[0]); + // d[1] = _a[1] > 0 ? floor(_a[1]) : ceil(_a[1]); + // + // ret = _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // break; + // case 4: + // d[0] = bankersRounding(_a[0]); + // d[1] = bankersRounding(_a[1]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // ret = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 5: + // d[0] = floor(_a[0]); + // d[1] = floor(_a[1]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // ret = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 6: + // d[0] = ceil(_a[0]); + // d[1] = ceil(_a[1]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // ret = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 7: + // d[0] = _a[0] > 0 ? floor(_a[0]) : ceil(_a[0]); + // d[1] = _a[1] > 0 ? floor(_a[1]) : ceil(_a[1]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // ret = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + // break; + // } + // + // return validate_double(ret, d[0], d[1]); + return TEST_UNIMPL; +} + +result_t test_mm_round_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // float f[4]; + // __m128 ret; + // + // __m128 a = load_m128(_a); + // switch (iter & 0x7) { + // case 0: + // f[0] = bankersRounding(_a[0]); + // f[1] = bankersRounding(_a[1]); + // f[2] = bankersRounding(_a[2]); + // f[3] = bankersRounding(_a[3]); + // + // ret = _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + // break; + // case 1: + // f[0] = floorf(_a[0]); + // f[1] = floorf(_a[1]); + // f[2] = floorf(_a[2]); + // f[3] = floorf(_a[3]); + // + // ret = _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // break; + // case 2: + // f[0] = ceilf(_a[0]); + // f[1] = ceilf(_a[1]); + // f[2] = ceilf(_a[2]); + // f[3] = ceilf(_a[3]); + // + // ret = _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // break; + // case 3: + // f[0] = _a[0] > 0 ? floorf(_a[0]) : ceilf(_a[0]); + // f[1] = _a[1] > 0 ? floorf(_a[1]) : ceilf(_a[1]); + // f[2] = _a[2] > 0 ? floorf(_a[2]) : ceilf(_a[2]); + // f[3] = _a[3] > 0 ? 
floorf(_a[3]) : ceilf(_a[3]); + // + // ret = _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // break; + // case 4: + // f[0] = bankersRounding(_a[0]); + // f[1] = bankersRounding(_a[1]); + // f[2] = bankersRounding(_a[2]); + // f[3] = bankersRounding(_a[3]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // ret = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 5: + // f[0] = floorf(_a[0]); + // f[1] = floorf(_a[1]); + // f[2] = floorf(_a[2]); + // f[3] = floorf(_a[3]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // ret = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 6: + // f[0] = ceilf(_a[0]); + // f[1] = ceilf(_a[1]); + // f[2] = ceilf(_a[2]); + // f[3] = ceilf(_a[3]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // ret = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // break; + // case 7: + // f[0] = _a[0] > 0 ? floorf(_a[0]) : ceilf(_a[0]); + // f[1] = _a[1] > 0 ? floorf(_a[1]) : ceilf(_a[1]); + // f[2] = _a[2] > 0 ? floorf(_a[2]) : ceilf(_a[2]); + // f[3] = _a[3] > 0 ? floorf(_a[3]) : ceilf(_a[3]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // ret = _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); + // break; + // } + // + // return validate_float(ret, f[0], f[1], f[2], f[3]); + return TEST_UNIMPL; +} + +result_t test_mm_round_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const double *_a = (double *)impl.test_cases_float_pointer1; + // const double *_b = (double *)impl.test_cases_float_pointer2; + // double d[2]; + // __m128d ret; + // + // __m128d a = load_m128d(_a); + // __m128d b = load_m128d(_b); + // d[1] = _a[1]; + // switch (iter & 0x7) { + // case 0: + // d[0] = bankersRounding(_b[0]); + // + // ret = _mm_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | + // _MM_FROUND_NO_EXC); break; + // case 1: + // d[0] = floor(_b[0]); + // + // ret = _mm_round_sd(a, b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // break; + // case 2: + // d[0] = ceil(_b[0]); + // + // ret = _mm_round_sd(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // break; + // case 3: + // d[0] = _b[0] > 0 ? floor(_b[0]) : ceil(_b[0]); + // + // ret = _mm_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // break; + // case 4: + // d[0] = bankersRounding(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // ret = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 5: + // d[0] = floor(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // ret = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 6: + // d[0] = ceil(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // ret = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 7: + // d[0] = _b[0] > 0 ? 
floor(_b[0]) : ceil(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // ret = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // } + // + // return validate_double(ret, d[0], d[1]); + return TEST_UNIMPL; +} + +result_t test_mm_round_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const float *_a = impl.test_cases_float_pointer1; + // const float *_b = impl.test_cases_float_pointer2; + // float f[4]; + // __m128 ret; + // + // __m128 a = load_m128(_a); + // __m128 b = load_m128(_b); + // switch (iter & 0x7) { + // case 0: + // f[0] = bankersRounding(_b[0]); + // + // ret = _mm_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | + // _MM_FROUND_NO_EXC); break; + // case 1: + // f[0] = floorf(_b[0]); + // + // ret = _mm_round_ss(a, b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + // break; + // case 2: + // f[0] = ceilf(_b[0]); + // + // ret = _mm_round_ss(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + // break; + // case 3: + // f[0] = _b[0] > 0 ? floorf(_b[0]) : ceilf(_b[0]); + // + // ret = _mm_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + // break; + // case 4: + // f[0] = bankersRounding(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + // ret = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 5: + // f[0] = floorf(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); + // ret = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 6: + // f[0] = ceilf(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_UP); + // ret = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // case 7: + // f[0] = _b[0] > 0 ? floorf(_b[0]) : ceilf(_b[0]); + // + // _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + // ret = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + // break; + // } + // f[1] = _a[1]; + // f[2] = _a[2]; + // f[3] = _a[3]; + // + // return validate_float(ret, f[0], f[1], f[2], f[3]); + return TEST_UNIMPL; +} + +result_t test_mm_stream_load_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // int32_t *addr = impl.test_cases_int_pointer1; + // + // __m128i ret = _mm_stream_load_si128((__m128i *)addr); + // + // return VALIDATE_INT32_M128(ret, addr); + return TEST_UNIMPL; +} + +result_t test_mm_test_all_ones(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // __m128i a = load_m128i(_a); + // + // int32_t d0 = ~_a[0] & (~(uint32_t)0); + // int32_t d1 = ~_a[1] & (~(uint32_t)0); + // int32_t d2 = ~_a[2] & (~(uint32_t)0); + // int32_t d3 = ~_a[3] & (~(uint32_t)0); + // int32_t result = ((d0 | d1 | d2 | d3) == 0) ? 1 : 0; + // + // int32_t ret = _mm_test_all_ones(a); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_test_all_zeros(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_mask = (const int32_t *)impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i mask = load_m128i(_mask); + // + // int32_t d0 = _a[0] & _mask[0]; + // int32_t d1 = _a[1] & _mask[1]; + // int32_t d2 = _a[2] & _mask[2]; + // int32_t d3 = _a[3] & _mask[3]; + // int32_t result = ((d0 | d1 | d2 | d3) == 0) ? 1 : 0; + // + // int32_t ret = _mm_test_all_zeros(a, mask); + // + // return result == ret ? 
TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_test_mix_ones_zeros(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *_mask = (const int32_t *)impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i mask = load_m128i(_mask); + // + // int32_t d0 = !((_a[0]) & _mask[0]) & !((!_a[0]) & _mask[0]); + // int32_t d1 = !((_a[1]) & _mask[1]) & !((!_a[1]) & _mask[1]); + // int32_t d2 = !((_a[2]) & _mask[2]) & !((!_a[2]) & _mask[2]); + // int32_t d3 = !((_a[3]) & _mask[3]) & !((!_a[3]) & _mask[3]); + // int32_t result = ((d0 & d1 & d2 & d3) == 0) ? 1 : 0; + // + // int32_t ret = _mm_test_mix_ones_zeros(a, mask); + // + // return result == ret ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_testc_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i b = _mm_load_si128((const __m128i *)_b); + // int testc = 1; + // for (int i = 0; i < 2; i++) { + // if ((~(((SIMDVec *)&a)->m128_u64[i]) & ((SIMDVec *)&b)->m128_u64[i])) { + // testc = 0; + // break; + // } + // } + // return _mm_testc_si128(a, b) == testc ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +result_t test_mm_testnzc_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // return test_mm_test_mix_ones_zeros(impl, iter); + return TEST_UNIMPL; +} + +result_t test_mm_testz_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *_a = impl.test_cases_int_pointer1; + // const int32_t *_b = impl.test_cases_int_pointer2; + // __m128i a = _mm_load_si128((const __m128i *)_a); + // __m128i b = _mm_load_si128((const __m128i *)_b); + // int testz = 1; + // for (int i = 0; i < 2; i++) { + // if ((((SIMDVec *)&a)->m128_u64[i] & ((SIMDVec *)&b)->m128_u64[i])) { + // testz = 0; + // break; + // } + // } + // return _mm_testz_si128(a, b) == testz ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +/* SSE4.2 */ + +result_t test_mm_cmpestrc(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_CMPESTRC_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_cmpgt_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1; + // const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2; + // + // int64_t result[2]; + // result[0] = _a[0] > _b[0] ? -1 : 0; + // result[1] = _a[1] > _b[1] ? 
-1 : 0; + // + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // __m128i iret = _mm_cmpgt_epi64(a, b); + // + // return validate_int64(iret, result[0], result[1]); + return TEST_UNIMPL; +} + +result_t test_mm_cmpistrs(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_CMPISTRS_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_cmpistrz(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // GENERATE_MM_CMPISTRZ_TEST_CASES + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_crc32_u16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint32_t crc = *(const uint32_t *)impl.test_cases_int_pointer1; + // uint16_t v = iter; + // uint32_t result = _mm_crc32_u16(crc, v); + // ASSERT_RETURN(result == canonical_crc32_u16(crc, v)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_crc32_u32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint32_t crc = *(const uint32_t *)impl.test_cases_int_pointer1; + // uint32_t v = *(const uint32_t *)impl.test_cases_int_pointer2; + // uint32_t result = _mm_crc32_u32(crc, v); + // ASSERT_RETURN(result == canonical_crc32_u32(crc, v)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_crc32_u64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint64_t crc = *(const uint64_t *)impl.test_cases_int_pointer1; + // uint64_t v = *(const uint64_t *)impl.test_cases_int_pointer2; + // uint64_t result = _mm_crc32_u64(crc, v); + // ASSERT_RETURN(result == canonical_crc32_u64(crc, v)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_crc32_u8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint32_t crc = *(const uint32_t *)impl.test_cases_int_pointer1; + // uint8_t v = iter; + // uint32_t result = _mm_crc32_u8(crc, v); + // ASSERT_RETURN(result == canonical_crc32_u8(crc, v)); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +/* AES */ +result_t test_mm_aesenc_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *a = (int32_t *)impl.test_cases_int_pointer1; + // const int32_t *b = (int32_t *)impl.test_cases_int_pointer2; + // __m128i data = _mm_loadu_si128((const __m128i *)a); + // __m128i rk = _mm_loadu_si128((const __m128i *)b); + // + // __m128i resultReference = aesenc_128_reference(data, rk); + // __m128i resultIntrinsic = _mm_aesenc_si128(data, rk); + // + // return validate_128bits(resultReference, resultIntrinsic); + return TEST_UNIMPL; +} + +result_t test_mm_aesdec_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const int32_t *a = (int32_t *)impl.test_cases_int_pointer1; + // const int32_t *b = (int32_t *)impl.test_cases_int_pointer2; + // __m128i data = _mm_loadu_si128((const __m128i *)a); + // __m128i rk = _mm_loadu_si128((const __m128i *)b); + // + // __m128i resultReference = aesdec_128_reference(data, rk); + // __m128i resultIntrinsic = _mm_aesdec_si128(data, rk); + // + // return validate_128bits(resultReference, resultIntrinsic); + return TEST_UNIMPL; +} + +result_t test_mm_aesenclast_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const int32_t *a = (const int32_t *)impl.test_cases_int_pointer1; + // const int32_t *b = (const int32_t *)impl.test_cases_int_pointer2; + // __m128i data = _mm_loadu_si128((const __m128i *)a); + // __m128i rk = _mm_loadu_si128((const __m128i *)b); + // + // __m128i resultReference = aesenclast_128_reference(data, rk); + // __m128i resultIntrinsic = _mm_aesenclast_si128(data, rk); + // + // return validate_128bits(resultReference, 
resultIntrinsic); + return TEST_UNIMPL; +} + +result_t test_mm_aesdeclast_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const uint8_t *a = (uint8_t *)impl.test_cases_int_pointer1; + // const uint8_t *rk = (uint8_t *)impl.test_cases_int_pointer2; + // __m128i _a = _mm_loadu_si128((const __m128i *)a); + // __m128i _rk = _mm_loadu_si128((const __m128i *)rk); + // uint8_t c[16] = {}; + // + // uint8_t v[4][4]; + // for (int i = 0; i < 16; ++i) { + // v[((i / 4) + (i % 4)) % 4][i % 4] = crypto_aes_rsbox[a[i]]; + // } + // for (int i = 0; i < 16; ++i) { + // c[i] = v[i / 4][i % 4] ^ rk[i]; + // } + // + // __m128i result_reference = _mm_loadu_si128((const __m128i *)c); + // __m128i result_intrinsic = _mm_aesdeclast_si128(_a, _rk); + // + // return validate_128bits(result_reference, result_intrinsic); + return TEST_UNIMPL; +} + +result_t test_mm_aesimc_si128(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint8_t *a = (uint8_t *)impl.test_cases_int_pointer1; + // __m128i _a = _mm_loadu_si128((const __m128i *)a); + // + // uint8_t e, f, g, h, v[4][4]; + // for (int i = 0; i < 16; ++i) { + // ((uint8_t *)v)[i] = a[i]; + // } + // for (int i = 0; i < 4; ++i) { + // e = v[i][0]; + // f = v[i][1]; + // g = v[i][2]; + // h = v[i][3]; + // + // v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^ + // MULTIPLY(h, 0x09); + // v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^ + // MULTIPLY(h, 0x0d); + // v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^ + // MULTIPLY(h, 0x0b); + // v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^ + // MULTIPLY(h, 0x0e); + // } + // + // __m128i result_reference = _mm_loadu_si128((const __m128i *)v); + // __m128i result_intrinsic = _mm_aesimc_si128(_a); + // + // return validate_128bits(result_reference, result_intrinsic); + return TEST_UNIMPL; +} + +static inline uint32_t sub_word(uint32_t in) { + return (crypto_aes_sbox[(in >> 24) & 0xff] << 24) | + (crypto_aes_sbox[(in >> 16) & 0xff] << 16) | + (crypto_aes_sbox[(in >> 8) & 0xff] << 8) | + (crypto_aes_sbox[in & 0xff]); +} + +// FIXME: improve the test case for AES-256 key expansion. 
+// Reference: +// https://github.com/randombit/botan/blob/master/src/lib/block/aes/aes_ni/aes_ni.cpp +result_t test_mm_aeskeygenassist_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const uint32_t *a = (uint32_t *)impl.test_cases_int_pointer1; + // __m128i data = load_m128i(a); + // uint32_t sub_x1 = sub_word(a[1]); + // uint32_t sub_x3 = sub_word(a[3]); + // __m128i result_reference; + // __m128i result_intrinsic; + // #define TEST_IMPL(IDX) + // uint32_t res##IDX[4] = { + // sub_x1, + // rotr(sub_x1, 8) ^ IDX, + // sub_x3, + // rotr(sub_x3, 8) ^ IDX, + // }; + // result_reference = load_m128i(res##IDX); + // result_intrinsic = _mm_aeskeygenassist_si128(data, IDX); + // CHECK_RESULT(validate_128bits(result_reference, result_intrinsic)); + // + // IMM_256_ITER + // #undef TEST_IMPL + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +/* Others */ +result_t test_mm_clmulepi64_si128(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // const uint64_t *_a = (const uint64_t *)impl.test_cases_int_pointer1; + // const uint64_t *_b = (const uint64_t *)impl.test_cases_int_pointer2; + // __m128i a = load_m128i(_a); + // __m128i b = load_m128i(_b); + // auto result = clmul_64(_a[0], _b[0]); + // if (!validate_uint64(_mm_clmulepi64_si128(a, b, 0x00), result.first, + // result.second)) + // return TEST_FAIL; + // result = clmul_64(_a[1], _b[0]); + // if (!validate_uint64(_mm_clmulepi64_si128(a, b, 0x01), result.first, + // result.second)) + // return TEST_FAIL; + // result = clmul_64(_a[0], _b[1]); + // if (!validate_uint64(_mm_clmulepi64_si128(a, b, 0x10), result.first, + // result.second)) + // return TEST_FAIL; + // result = clmul_64(_a[1], _b[1]); + // if (!validate_uint64(_mm_clmulepi64_si128(a, b, 0x11), result.first, + // result.second)) + // return TEST_FAIL; + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_get_denormals_zero_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // int res_denormals_zero_on, res_denormals_zero_off; + // + // _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + // res_denormals_zero_on = + // _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON; + // + // _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); + // res_denormals_zero_off = + // _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_OFF; + // + // return (res_denormals_zero_on && res_denormals_zero_off) ? 
TEST_SUCCESS + // : TEST_FAIL; + return TEST_UNIMPL; +} + +// static int popcnt_reference(uint64_t a) { +// int count = 0; +// while (a != 0) { +// count += a & 1; +// a >>= 1; +// } +// return count; +// } + +result_t test_mm_popcnt_u32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint64_t *a = (const uint64_t *)impl.test_cases_int_pointer1; + // ASSERT_RETURN(popcnt_reference((uint32_t)a[0]) == _mm_popcnt_u32(a[0])); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_popcnt_u64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // const uint64_t *a = (const uint64_t *)impl.test_cases_int_pointer1; + // ASSERT_RETURN(popcnt_reference(a[0]) == _mm_popcnt_u64(a[0])); + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_mm_set_denormals_zero_mode(const SSE2RVV_TEST_IMPL &impl, + uint32_t iter) { + // result_t res_set_denormals_zero_on, res_set_denormals_zero_off; + // float factor = 2; + // float denormal = FLT_MIN / factor; + // float denormals[4] = {denormal, denormal, denormal, denormal}; + // float factors[4] = {factor, factor, factor, factor}; + // __m128 ret; + // + // _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + // ret = _mm_mul_ps(load_m128(denormals), load_m128(factors)); + // res_set_denormals_zero_on = validate_float(ret, 0, 0, 0, 0); + // + // _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); + // ret = _mm_mul_ps(load_m128(denormals), load_m128(factors)); + // res_set_denormals_zero_off = + // validate_float(ret, FLT_MIN, FLT_MIN, FLT_MIN, FLT_MIN); + // + // if (res_set_denormals_zero_on == TEST_FAIL || + // res_set_denormals_zero_off == TEST_FAIL) + // return TEST_FAIL; + // return TEST_SUCCESS; + return TEST_UNIMPL; +} + +result_t test_rdtsc(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + // uint64_t start = _rdtsc(); + // for (int i = 0; i < 100000; i++) { + // #if defined(_MSC_VER) + // _ReadWriteBarrier(); + // #else + // __asm__ __volatile__("" ::: "memory"); + // #endif + // } + // uint64_t end = _rdtsc(); + // return end > start ? TEST_SUCCESS : TEST_FAIL; + return TEST_UNIMPL; +} + +#if defined(__riscv_v_elen) +#define REGISTER_SIZE __riscv_v_elen +#elif defined(__aarch64__) +#define REGISTER_SIZE 128 +#elif (defined(__x86_64__) || defined(__i386__)) +#define REGISTER_SIZE sizeof(__m128) +#endif + +SSE2RVV_TEST_IMPL::SSE2RVV_TEST_IMPL(void) { + test_cases_float_pointer1 = (float *)platform_aligned_alloc(REGISTER_SIZE); + test_cases_float_pointer2 = (float *)platform_aligned_alloc(REGISTER_SIZE); + test_cases_int_pointer1 = (int32_t *)platform_aligned_alloc(REGISTER_SIZE); + test_cases_int_pointer2 = (int32_t *)platform_aligned_alloc(REGISTER_SIZE); + SSE2RVV_INIT_RNG(123456); + for (uint32_t i = 0; i < MAX_TEST_VALUE; i++) { + test_cases_floats[i] = ranf(-100000, 100000); + test_cases_ints[i] = (int32_t)ranf(-100000, 100000); + } +} + +// Dummy function to match the case label in run_single_test. 
+result_t test_last(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) { + return TEST_SUCCESS; +} + +result_t SSE2RVV_TEST_IMPL::load_test_float_pointers(uint32_t i) { + // result_t ret = do_mm_store_ps( + // test_cases_float_pointer1, test_cases_floats[i], test_cases_floats[i + + // 1], test_cases_floats[i + 2], test_cases_floats[i + 3]); + // if (ret == TEST_SUCCESS) { + // ret = do_mm_store_ps(test_cases_float_pointer2, test_cases_floats[i + 4], + // test_cases_floats[i + 5], test_cases_floats[i + 6], + // test_cases_floats[i + 7]); + // } + // return ret; + return TEST_UNIMPL; +} + +result_t SSE2RVV_TEST_IMPL::load_test_int_pointers(uint32_t i) { + // result_t ret = do_mm_store_ps(test_cases_int_pointer1, test_cases_ints[i], + // test_cases_ints[i + 1], test_cases_ints[i + + // 2], test_cases_ints[i + 3]); + // if (ret == TEST_SUCCESS) { + // ret = do_mm_store_ps(test_cases_int_pointer2, test_cases_ints[i + 4], + // test_cases_ints[i + 5], test_cases_ints[i + 6], + // test_cases_ints[i + 7]); + // } + + // return ret; + return TEST_UNIMPL; +} + +result_t SSE2RVV_TEST_IMPL::run_single_test(INSTRUCTION_TEST test, uint32_t i) { + result_t ret = TEST_SUCCESS; + + switch (test) { +#define _(x) \ + case it_##x: \ + ret = test_##x(*this, i); \ + break; + INTRIN_LIST +#undef _ + } + + return ret; +} + +const char *instruction_string[] = { +#define _(x) #x, + INTRIN_LIST +#undef _ +}; + +SSE2RVV_TEST *SSE2RVV_TEST::create(void) { + SSE2RVV_TEST_IMPL *st = new SSE2RVV_TEST_IMPL; + return static_cast<SSE2RVV_TEST *>(st); +} + +} // namespace SSE2RVV diff --git a/tests/impl.h b/tests/impl.h new file mode 100644 index 0000000..bdf4931 --- /dev/null +++ b/tests/impl.h @@ -0,0 +1,570 @@ +#ifndef SSE2RVV_TEST_H +#define SSE2RVV_TEST_H + +#include "common.h" + +#define INTRIN_LIST \ + /* MMX */ \ + _(mm_empty) \ + /* SSE */ \ + _(mm_add_ps) \ + _(mm_add_ss) \ + _(mm_and_ps) \ + _(mm_andnot_ps) \ + _(mm_avg_pu16) \ + _(mm_avg_pu8) \ + _(mm_cmpeq_ps) \ + _(mm_cmpeq_ss) \ + _(mm_cmpge_ps) \ + _(mm_cmpge_ss) \ + _(mm_cmpgt_ps) \ + _(mm_cmpgt_ss) \ + _(mm_cmple_ps) \ + _(mm_cmple_ss) \ + _(mm_cmplt_ps) \ + _(mm_cmplt_ss) \ + _(mm_cmpneq_ps) \ + _(mm_cmpneq_ss) \ + _(mm_cmpnge_ps) \ + _(mm_cmpnge_ss) \ + _(mm_cmpngt_ps) \ + _(mm_cmpngt_ss) \ + _(mm_cmpnle_ps) \ + _(mm_cmpnle_ss) \ + _(mm_cmpnlt_ps) \ + _(mm_cmpnlt_ss) \ + _(mm_cmpord_ps) \ + _(mm_cmpord_ss) \ + _(mm_cmpunord_ps) \ + _(mm_cmpunord_ss) \ + _(mm_comieq_ss) \ + _(mm_comige_ss) \ + _(mm_comigt_ss) \ + _(mm_comile_ss) \ + _(mm_comilt_ss) \ + _(mm_comineq_ss) \ + _(mm_cvt_pi2ps) \ + _(mm_cvt_ps2pi) \ + _(mm_cvt_si2ss) \ + _(mm_cvt_ss2si) \ + _(mm_cvtpi16_ps) \ + _(mm_cvtpi32_ps) \ + _(mm_cvtpi32x2_ps) \ + _(mm_cvtpi8_ps) \ + _(mm_cvtps_pi16) \ + _(mm_cvtps_pi32) \ + _(mm_cvtps_pi8) \ + _(mm_cvtpu16_ps) \ + _(mm_cvtpu8_ps) \ + _(mm_cvtsi32_ss) \ + _(mm_cvtsi64_ss) \ + _(mm_cvtss_f32) \ + _(mm_cvtss_si32) \ + _(mm_cvtss_si64) \ + _(mm_cvtt_ps2pi) \ + _(mm_cvtt_ss2si) \ + _(mm_cvttps_pi32) \ + _(mm_cvttss_si32) \ + _(mm_cvttss_si64) \ + _(mm_div_ps) \ + _(mm_div_ss) \ + _(mm_extract_pi16) \ + _(mm_free) \ + _(mm_get_flush_zero_mode) \ + _(mm_get_rounding_mode) \ + _(mm_getcsr) \ + _(mm_insert_pi16) \ + _(mm_load_ps) \ + _(mm_load_ps1) \ + _(mm_load_ss) \ + _(mm_load1_ps) \ + _(mm_loadh_pi) \ + _(mm_loadl_pi) \ + _(mm_loadr_ps) \ + _(mm_loadu_ps) \ + _(mm_loadu_si16) \ + _(mm_loadu_si64) \ + _(mm_malloc) \ + _(mm_maskmove_si64) \ + _(m_maskmovq) \ + _(mm_max_pi16) \ + _(mm_max_ps) \ + _(mm_max_pu8) \ + _(mm_max_ss) \ + _(mm_min_pi16) \ + _(mm_min_ps) \ + _(mm_min_pu8) \ + 
_(mm_min_ss) \ + _(mm_move_ss) \ + _(mm_movehl_ps) \ + _(mm_movelh_ps) \ + _(mm_movemask_pi8) \ + _(mm_movemask_ps) \ + _(mm_mul_ps) \ + _(mm_mul_ss) \ + _(mm_mulhi_pu16) \ + _(mm_or_ps) \ + _(m_pavgb) \ + _(m_pavgw) \ + _(m_pextrw) \ + _(m_pinsrw) \ + _(m_pmaxsw) \ + _(m_pmaxub) \ + _(m_pminsw) \ + _(m_pminub) \ + _(m_pmovmskb) \ + _(m_pmulhuw) \ + _(mm_prefetch) \ + _(m_psadbw) \ + _(m_pshufw) \ + _(mm_rcp_ps) \ + _(mm_rcp_ss) \ + _(mm_rsqrt_ps) \ + _(mm_rsqrt_ss) \ + _(mm_sad_pu8) \ + _(mm_set_flush_zero_mode) \ + _(mm_set_ps) \ + _(mm_set_ps1) \ + _(mm_set_rounding_mode) \ + _(mm_set_ss) \ + _(mm_set1_ps) \ + _(mm_setcsr) \ + _(mm_setr_ps) \ + _(mm_setzero_ps) \ + _(mm_sfence) \ + _(mm_shuffle_pi16) \ + _(mm_shuffle_ps) \ + _(mm_sqrt_ps) \ + _(mm_sqrt_ss) \ + _(mm_store_ps) \ + _(mm_store_ps1) \ + _(mm_store_ss) \ + _(mm_store1_ps) \ + _(mm_storeh_pi) \ + _(mm_storel_pi) \ + _(mm_storer_ps) \ + _(mm_storeu_ps) \ + _(mm_storeu_si16) \ + _(mm_storeu_si64) \ + _(mm_stream_pi) \ + _(mm_stream_ps) \ + _(mm_sub_ps) \ + _(mm_sub_ss) \ + _(mm_ucomieq_ss) \ + _(mm_ucomige_ss) \ + _(mm_ucomigt_ss) \ + _(mm_ucomile_ss) \ + _(mm_ucomilt_ss) \ + _(mm_ucomineq_ss) \ + _(mm_undefined_ps) \ + _(mm_unpackhi_ps) \ + _(mm_unpacklo_ps) \ + _(mm_xor_ps) \ + /* SSE2 */ \ + _(mm_add_epi16) \ + _(mm_add_epi32) \ + _(mm_add_epi64) \ + _(mm_add_epi8) \ + _(mm_add_pd) \ + _(mm_add_sd) \ + _(mm_add_si64) \ + _(mm_adds_epi16) \ + _(mm_adds_epi8) \ + _(mm_adds_epu16) \ + _(mm_adds_epu8) \ + _(mm_and_pd) \ + _(mm_and_si128) \ + _(mm_andnot_pd) \ + _(mm_andnot_si128) \ + _(mm_avg_epu16) \ + _(mm_avg_epu8) \ + _(mm_bslli_si128) \ + _(mm_bsrli_si128) \ + _(mm_castpd_ps) \ + _(mm_castpd_si128) \ + _(mm_castps_pd) \ + _(mm_castps_si128) \ + _(mm_castsi128_pd) \ + _(mm_castsi128_ps) \ + _(mm_clflush) \ + _(mm_cmpeq_epi16) \ + _(mm_cmpeq_epi32) \ + _(mm_cmpeq_epi8) \ + _(mm_cmpeq_pd) \ + _(mm_cmpeq_sd) \ + _(mm_cmpge_pd) \ + _(mm_cmpge_sd) \ + _(mm_cmpgt_epi16) \ + _(mm_cmpgt_epi32) \ + _(mm_cmpgt_epi8) \ + _(mm_cmpgt_pd) \ + _(mm_cmpgt_sd) \ + _(mm_cmple_pd) \ + _(mm_cmple_sd) \ + _(mm_cmplt_epi16) \ + _(mm_cmplt_epi32) \ + _(mm_cmplt_epi8) \ + _(mm_cmplt_pd) \ + _(mm_cmplt_sd) \ + _(mm_cmpneq_pd) \ + _(mm_cmpneq_sd) \ + _(mm_cmpnge_pd) \ + _(mm_cmpnge_sd) \ + _(mm_cmpngt_pd) \ + _(mm_cmpngt_sd) \ + _(mm_cmpnle_pd) \ + _(mm_cmpnle_sd) \ + _(mm_cmpnlt_pd) \ + _(mm_cmpnlt_sd) \ + _(mm_cmpord_pd) \ + _(mm_cmpord_sd) \ + _(mm_cmpunord_pd) \ + _(mm_cmpunord_sd) \ + _(mm_comieq_sd) \ + _(mm_comige_sd) \ + _(mm_comigt_sd) \ + _(mm_comile_sd) \ + _(mm_comilt_sd) \ + _(mm_comineq_sd) \ + _(mm_cvtepi32_pd) \ + _(mm_cvtepi32_ps) \ + _(mm_cvtpd_epi32) \ + _(mm_cvtpd_pi32) \ + _(mm_cvtpd_ps) \ + _(mm_cvtpi32_pd) \ + _(mm_cvtps_epi32) \ + _(mm_cvtps_pd) \ + _(mm_cvtsd_f64) \ + _(mm_cvtsd_si32) \ + _(mm_cvtsd_si64) \ + _(mm_cvtsd_si64x) \ + _(mm_cvtsd_ss) \ + _(mm_cvtsi128_si32) \ + _(mm_cvtsi128_si64) \ + _(mm_cvtsi128_si64x) \ + _(mm_cvtsi32_sd) \ + _(mm_cvtsi32_si128) \ + _(mm_cvtsi64_sd) \ + _(mm_cvtsi64_si128) \ + _(mm_cvtsi64x_sd) \ + _(mm_cvtsi64x_si128) \ + _(mm_cvtss_sd) \ + _(mm_cvttpd_epi32) \ + _(mm_cvttpd_pi32) \ + _(mm_cvttps_epi32) \ + _(mm_cvttsd_si32) \ + _(mm_cvttsd_si64) \ + _(mm_cvttsd_si64x) \ + _(mm_div_pd) \ + _(mm_div_sd) \ + _(mm_extract_epi16) \ + _(mm_insert_epi16) \ + _(mm_lfence) \ + _(mm_load_pd) \ + _(mm_load_pd1) \ + _(mm_load_sd) \ + _(mm_load_si128) \ + _(mm_load1_pd) \ + _(mm_loadh_pd) \ + _(mm_loadl_epi64) \ + _(mm_loadl_pd) \ + _(mm_loadr_pd) \ + _(mm_loadu_pd) \ + _(mm_loadu_si128) \ + 
_(mm_loadu_si32) \ + _(mm_madd_epi16) \ + _(mm_maskmoveu_si128) \ + _(mm_max_epi16) \ + _(mm_max_epu8) \ + _(mm_max_pd) \ + _(mm_max_sd) \ + _(mm_mfence) \ + _(mm_min_epi16) \ + _(mm_min_epu8) \ + _(mm_min_pd) \ + _(mm_min_sd) \ + _(mm_move_epi64) \ + _(mm_move_sd) \ + _(mm_movemask_epi8) \ + _(mm_movemask_pd) \ + _(mm_movepi64_pi64) \ + _(mm_movpi64_epi64) \ + _(mm_mul_epu32) \ + _(mm_mul_pd) \ + _(mm_mul_sd) \ + _(mm_mul_su32) \ + _(mm_mulhi_epi16) \ + _(mm_mulhi_epu16) \ + _(mm_mullo_epi16) \ + _(mm_or_pd) \ + _(mm_or_si128) \ + _(mm_packs_epi16) \ + _(mm_packs_epi32) \ + _(mm_packus_epi16) \ + _(mm_pause) \ + _(mm_sad_epu8) \ + _(mm_set_epi16) \ + _(mm_set_epi32) \ + _(mm_set_epi64) \ + _(mm_set_epi64x) \ + _(mm_set_epi8) \ + _(mm_set_pd) \ + _(mm_set_pd1) \ + _(mm_set_sd) \ + _(mm_set1_epi16) \ + _(mm_set1_epi32) \ + _(mm_set1_epi64) \ + _(mm_set1_epi64x) \ + _(mm_set1_epi8) \ + _(mm_set1_pd) \ + _(mm_setr_epi16) \ + _(mm_setr_epi32) \ + _(mm_setr_epi64) \ + _(mm_setr_epi8) \ + _(mm_setr_pd) \ + _(mm_setzero_pd) \ + _(mm_setzero_si128) \ + _(mm_shuffle_epi32) \ + _(mm_shuffle_pd) \ + _(mm_shufflehi_epi16) \ + _(mm_shufflelo_epi16) \ + _(mm_sll_epi16) \ + _(mm_sll_epi32) \ + _(mm_sll_epi64) \ + _(mm_slli_epi16) \ + _(mm_slli_epi32) \ + _(mm_slli_epi64) \ + _(mm_slli_si128) \ + _(mm_sqrt_pd) \ + _(mm_sqrt_sd) \ + _(mm_sra_epi16) \ + _(mm_sra_epi32) \ + _(mm_srai_epi16) \ + _(mm_srai_epi32) \ + _(mm_srl_epi16) \ + _(mm_srl_epi32) \ + _(mm_srl_epi64) \ + _(mm_srli_epi16) \ + _(mm_srli_epi32) \ + _(mm_srli_epi64) \ + _(mm_srli_si128) \ + _(mm_store_pd) \ + _(mm_store_pd1) \ + _(mm_store_sd) \ + _(mm_store_si128) \ + _(mm_store1_pd) \ + _(mm_storeh_pd) \ + _(mm_storel_epi64) \ + _(mm_storel_pd) \ + _(mm_storer_pd) \ + _(mm_storeu_pd) \ + _(mm_storeu_si128) \ + _(mm_storeu_si32) \ + _(mm_stream_pd) \ + _(mm_stream_si128) \ + _(mm_stream_si32) \ + _(mm_stream_si64) \ + _(mm_sub_epi16) \ + _(mm_sub_epi32) \ + _(mm_sub_epi64) \ + _(mm_sub_epi8) \ + _(mm_sub_pd) \ + _(mm_sub_sd) \ + _(mm_sub_si64) \ + _(mm_subs_epi16) \ + _(mm_subs_epi8) \ + _(mm_subs_epu16) \ + _(mm_subs_epu8) \ + _(mm_ucomieq_sd) \ + _(mm_ucomige_sd) \ + _(mm_ucomigt_sd) \ + _(mm_ucomile_sd) \ + _(mm_ucomilt_sd) \ + _(mm_ucomineq_sd) \ + _(mm_undefined_pd) \ + _(mm_undefined_si128) \ + _(mm_unpackhi_epi16) \ + _(mm_unpackhi_epi32) \ + _(mm_unpackhi_epi64) \ + _(mm_unpackhi_epi8) \ + _(mm_unpackhi_pd) \ + _(mm_unpacklo_epi16) \ + _(mm_unpacklo_epi32) \ + _(mm_unpacklo_epi64) \ + _(mm_unpacklo_epi8) \ + _(mm_unpacklo_pd) \ + _(mm_xor_pd) \ + _(mm_xor_si128) \ + /* SSE3 */ \ + _(mm_addsub_pd) \ + _(mm_addsub_ps) \ + _(mm_hadd_pd) \ + _(mm_hadd_ps) \ + _(mm_hsub_pd) \ + _(mm_hsub_ps) \ + _(mm_lddqu_si128) \ + _(mm_loaddup_pd) \ + _(mm_movedup_pd) \ + _(mm_movehdup_ps) \ + _(mm_moveldup_ps) \ + /* SSSE3 */ \ + _(mm_abs_epi16) \ + _(mm_abs_epi32) \ + _(mm_abs_epi8) \ + _(mm_abs_pi16) \ + _(mm_abs_pi32) \ + _(mm_abs_pi8) \ + _(mm_alignr_epi8) \ + _(mm_alignr_pi8) \ + _(mm_hadd_epi16) \ + _(mm_hadd_epi32) \ + _(mm_hadd_pi16) \ + _(mm_hadd_pi32) \ + _(mm_hadds_epi16) \ + _(mm_hadds_pi16) \ + _(mm_hsub_epi16) \ + _(mm_hsub_epi32) \ + _(mm_hsub_pi16) \ + _(mm_hsub_pi32) \ + _(mm_hsubs_epi16) \ + _(mm_hsubs_pi16) \ + _(mm_maddubs_epi16) \ + _(mm_maddubs_pi16) \ + _(mm_mulhrs_epi16) \ + _(mm_mulhrs_pi16) \ + _(mm_shuffle_epi8) \ + _(mm_shuffle_pi8) \ + _(mm_sign_epi16) \ + _(mm_sign_epi32) \ + _(mm_sign_epi8) \ + _(mm_sign_pi16) \ + _(mm_sign_pi32) \ + _(mm_sign_pi8) \ + /* SSE4.1 */ \ + _(mm_blend_epi16) \ + _(mm_blend_pd) \ + 
_(mm_blend_ps) \ + _(mm_blendv_epi8) \ + _(mm_blendv_pd) \ + _(mm_blendv_ps) \ + _(mm_ceil_pd) \ + _(mm_ceil_ps) \ + _(mm_ceil_sd) \ + _(mm_ceil_ss) \ + _(mm_cmpeq_epi64) \ + _(mm_cvtepi16_epi32) \ + _(mm_cvtepi16_epi64) \ + _(mm_cvtepi32_epi64) \ + _(mm_cvtepi8_epi16) \ + _(mm_cvtepi8_epi32) \ + _(mm_cvtepi8_epi64) \ + _(mm_cvtepu16_epi32) \ + _(mm_cvtepu16_epi64) \ + _(mm_cvtepu32_epi64) \ + _(mm_cvtepu8_epi16) \ + _(mm_cvtepu8_epi32) \ + _(mm_cvtepu8_epi64) \ + _(mm_dp_pd) \ + _(mm_dp_ps) \ + _(mm_extract_epi32) \ + _(mm_extract_epi64) \ + _(mm_extract_epi8) \ + _(mm_extract_ps) \ + _(mm_floor_pd) \ + _(mm_floor_ps) \ + _(mm_floor_sd) \ + _(mm_floor_ss) \ + _(mm_insert_epi32) \ + _(mm_insert_epi64) \ + _(mm_insert_epi8) \ + _(mm_insert_ps) \ + _(mm_max_epi32) \ + _(mm_max_epi8) \ + _(mm_max_epu16) \ + _(mm_max_epu32) \ + _(mm_min_epi32) \ + _(mm_min_epi8) \ + _(mm_min_epu16) \ + _(mm_min_epu32) \ + _(mm_minpos_epu16) \ + _(mm_mpsadbw_epu8) \ + _(mm_mul_epi32) \ + _(mm_mullo_epi32) \ + _(mm_packus_epi32) \ + _(mm_round_pd) \ + _(mm_round_ps) \ + _(mm_round_sd) \ + _(mm_round_ss) \ + _(mm_stream_load_si128) \ + _(mm_test_all_ones) \ + _(mm_test_all_zeros) \ + _(mm_test_mix_ones_zeros) \ + _(mm_testc_si128) \ + _(mm_testnzc_si128) \ + _(mm_testz_si128) \ + /* SSE4.2 */ \ + /*_(mm_cmpestra)*/ \ + /*_(mm_cmpestrc) */ \ + /*_(mm_cmpestri) */ \ + /*_(mm_cmpestrm) */ \ + /*_(mm_cmpestro) */ \ + /*_(mm_cmpestrs) */ \ + /*_(mm_cmpestrz) */ \ + /*_(mm_cmpgt_epi64) */ \ + /*_(mm_cmpistra) */ \ + /*_(mm_cmpistrc) */ \ + /*_(mm_cmpistri) */ \ + /*_(mm_cmpistrm) */ \ + /*_(mm_cmpistro) */ \ + /*_(mm_cmpistrs) */ \ + /*_(mm_cmpistrz) */ \ + /*_(mm_crc32_u16) */ \ + /*_(mm_crc32_u32) */ \ + /*_(mm_crc32_u64) */ \ + /*_(mm_crc32_u8) */ \ + /* AES */ \ + _(mm_aesenc_si128) \ + _(mm_aesdec_si128) \ + _(mm_aesenclast_si128) \ + _(mm_aesdeclast_si128) \ + _(mm_aesimc_si128) \ + _(mm_aeskeygenassist_si128) \ + /* Others */ \ + _(mm_clmulepi64_si128) \ + _(mm_get_denormals_zero_mode) \ + _(mm_popcnt_u32) \ + _(mm_popcnt_u64) \ + _(mm_set_denormals_zero_mode) \ + _(rdtsc) \ + _(last) /* This indicates the end of macros */ + +namespace SSE2RVV { +// The way unit tests are implemented is that 10,000 random floating point and +// integer vec4 numbers are generated as sample data. +// +// A short C implementation of every intrinsic is implemented and compared to +// the actual expected results from the corresponding SSE intrinsic against all +// of the 10,000 randomized input vectors. When running on RISCV, then the +// results are compared to the RISCV approximate version. +extern const char *instruction_string[]; +enum INSTRUCTION_TEST { +#define _(x) it_##x, + INTRIN_LIST +#undef _ +}; + +class SSE2RVV_TEST { +public: + static SSE2RVV_TEST *create(void); // create the test. 
+ + // Run test of this instruction; + // Passed: TEST_SUCCESS (1) + // Failed: TEST_FAIL (0) + // Unimplemented: TEST_UNIMPL (-1) + virtual result_t run_test(INSTRUCTION_TEST test) = 0; + virtual void release(void) = 0; +}; + +} // namespace SSE2RVV + +#endif // SSE2RVV_TEST_H diff --git a/tests/main.cpp b/tests/main.cpp new file mode 100644 index 0000000..e7deff6 --- /dev/null +++ b/tests/main.cpp @@ -0,0 +1,36 @@ +#include "impl.h" +#include <cstdint> +#include <cstdio> + +int main(int /*argc*/, const char ** /*argv*/) { + SSE2RVV::SSE2RVV_TEST *test = SSE2RVV::SSE2RVV_TEST::create(); + uint32_t pass_count = 0; + uint32_t failed_count = 0; + uint32_t ignore_count = 0; + for (uint32_t i = 0; i < SSE2RVV::it_last; i++) { + SSE2RVV::INSTRUCTION_TEST it = SSE2RVV::INSTRUCTION_TEST(i); + SSE2RVV::result_t ret = test->run_test(it); + // If a test fails, re-run the binary under a debugger and set a + // breakpoint here to step into the failing case. + if (ret == SSE2RVV::TEST_FAIL) { + printf("Test %-30s failed\n", SSE2RVV::instruction_string[it]); + failed_count++; + } else if (ret == SSE2RVV::TEST_UNIMPL) { + printf("Test %-30s skipped\n", SSE2RVV::instruction_string[it]); + ignore_count++; + } else { + printf("Test %-30s passed\n", SSE2RVV::instruction_string[it]); + pass_count++; + } + } + test->release(); + printf("SSE2RVV_TEST Complete!\n" + "Passed: %d\n" + "Failed: %d\n" + "Ignored: %d\n" + "Coverage rate: %.2f%%\n", + pass_count, failed_count, ignore_count, + (float)pass_count / (pass_count + failed_count + ignore_count) * 100); + + return failed_count ? -1 : 0; +}
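
A note on the generator pattern in the patch above: `INTRIN_LIST` is an X-macro, and `run_single_test`, `instruction_string[]`, and the `INSTRUCTION_TEST` enum are all produced from that single list by redefining `_` before each expansion, so adding one `_(mm_xxx)` entry wires a new test into the enum, the name table, and the dispatch switch at once. Below is a minimal standalone sketch of the same technique; the three-entry `DEMO_LIST` and its names are made up for illustration and are not part of the patch.

```C
#include <stdio.h>

/* Hypothetical stand-in for INTRIN_LIST. */
#define DEMO_LIST \
  _(mm_add_ps)    \
  _(mm_sub_ps)    \
  _(last)

/* 1. Enum of test identifiers (mirrors INSTRUCTION_TEST). */
enum DEMO_TEST {
#define _(x) it_##x,
  DEMO_LIST
#undef _
};

/* 2. Parallel string table (mirrors instruction_string[]). */
static const char *demo_string[] = {
#define _(x) #x,
    DEMO_LIST
#undef _
};

int main(void) {
  /* 3. Dispatch switch (mirrors run_single_test). */
  for (int i = 0; i < it_last; i++) {
    switch (i) {
#define _(x)                                              \
  case it_##x:                                            \
    printf("would dispatch to test_%s\n", demo_string[i]); \
    break;
      DEMO_LIST
#undef _
    }
  }
  return 0;
}
```

The trailing `_(last)` entry plays the same role as `_(last)` in `INTRIN_LIST`: it terminates the enum so the driver in `tests/main.cpp` can loop over `it_last` entries.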
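Most of the SSE4.1/SSE4.2/AES cases in this part of the patch are still stubs that return `TEST_UNIMPL`, with their reference logic kept in comments. They all follow the flow described in `impl.h`: build a scalar reference from the raw input buffers, run the intrinsic on loaded vectors, and compare lane by lane. The standalone sketch below restates that flow for `_mm_cvtepi16_epi32` using plain SSE4.1 headers and fixed inputs of my own choosing; inside the suite the same steps go through `load_m128i`, `VALIDATE_INT32_M128`, and the randomized test buffers instead.

```C
#include <smmintrin.h> /* SSE4.1; build with -msse4.1 on x86 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  int16_t in[8] = {-3, 7, -20000, 30000, 1, 2, 3, 4};

  /* 1. Scalar reference: sign-extend the low four 16-bit lanes. */
  int32_t ref[4];
  for (int i = 0; i < 4; i++)
    ref[i] = (int32_t)in[i];

  /* 2. Run the intrinsic (or its sse2rvv.h replacement on RISC-V). */
  __m128i a = _mm_loadu_si128((const __m128i *)in);
  __m128i c = _mm_cvtepi16_epi32(a);

  /* 3. Validate lane by lane. */
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, c);
  int ok = memcmp(ref, out, sizeof(ref)) == 0;
  printf("%s\n", ok ? "TEST_SUCCESS" : "TEST_FAIL");
  return ok ? 0 : 1;
}
```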