diff --git a/Makefile b/Makefile index 2a11bea..9a84284 100644 --- a/Makefile +++ b/Makefile @@ -28,8 +28,6 @@ else # CROSS_COMPILE was set $(error Unsupported cross-compiler) endif - ARCH_CFLAGS = -march=$(processor)gcv_zba - ifeq ($(SIMULATOR_TYPE), qemu) SIMULATOR += qemu-riscv64 SIMULATOR_FLAGS = -cpu $(processor),v=true,zba=true,vlen=128 @@ -40,6 +38,14 @@ else # CROSS_COMPILE was set endif endif +ifeq ($(processor),$(filter $(processor),rv64 rv32)) + ARCH_CFLAGS = -march=$(processor)gcv_zba +else ifeq ($(processor),$(filter $(processor),i386 x86_64)) + ARCH_CFLAGS = -maes -mpclmul -mssse3 -msse4.2 +else + $(error Unsupported architecture) +endif + CXXFLAGS += -Wall -Wcast-qual -I. $(ARCH_CFLAGS) LDFLAGS += -lm OBJS = \ diff --git a/sse2rvv.h b/sse2rvv.h index 3a7b3e6..c58d709 100644 --- a/sse2rvv.h +++ b/sse2rvv.h @@ -73,145 +73,6 @@ #include #include -#if defined(__GNUC__) || defined(__clang__) -#define _sse2rvv_define0(type, s, body) \ - __extension__({ \ - type _a = (s); \ - body \ - }) -#define _sse2rvv_define1(type, s, body) \ - __extension__({ \ - type _a = (s); \ - body \ - }) -#define _sse2rvv_define2(type, a, b, body) \ - __extension__({ \ - type _a = (a), _b = (b); \ - body \ - }) -#define _sse2rvv_return(ret) (ret) -#else -#define _sse2rvv_define0(type, a, body) [=](type _a) { body }(a) -#define _sse2rvv_define1(type, a, body) [](type _a) { body }(a) -#define _sse2rvv_define2(type, a, b, body) \ - [](type _a, type _b) { body }((a), (b)) -#define _sse2rvv_return(ret) return ret -#endif - -#define _sse2rvv_init(...) \ - { __VA_ARGS__ } - -/* Compiler barrier */ -#define SSE2RVV_BARRIER() \ - do { \ - __asm__ __volatile__("" ::: "memory"); \ - (void)0; \ - } while (0) - -/* Memory barriers - * __atomic_thread_fence does not include a compiler barrier; instead, - * the barrier is part of __atomic_load/__atomic_store's "volatile-like" - * semantics. - */ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) -#include -#endif - -FORCE_INLINE void _sse2rvv_smp_mb(void) { - SSE2RVV_BARRIER(); -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(memory_order_seq_cst); -#elif defined(__GNUC__) || defined(__clang__) - __atomic_thread_fence(__ATOMIC_SEQ_CST); -#endif -} - -/* "__has_builtin" can be used to query support for built-in functions - * provided by gcc/clang and other compilers that support it. - */ -#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ -/* Compatibility with gcc <= 9 */ -#if defined(__GNUC__) && (__GNUC__ <= 9) -#define __has_builtin(x) HAS##x -#define HAS__builtin_popcount 1 -#define HAS__builtin_popcountll 1 - -// __builtin_shuffle introduced in GCC 4.7.0 -#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) -#define HAS__builtin_shuffle 1 -#else -#define HAS__builtin_shuffle 0 -#endif - -#define HAS__builtin_shufflevector 0 -#define HAS__builtin_nontemporal_store 0 -#else -#define __has_builtin(x) 0 -#endif -#endif - -/** - * MACRO for shuffle parameter for _mm_shuffle_ps(). - * Argument fp3 is a digit[0123] that represents the fp from argument "b" - * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same - * for fp2 in result. fp1 is a digit[0123] that represents the fp from - * argument "a" of mm_shuffle_ps that will be places in fp1 of result. - * fp0 is the same for fp0 of result. 
- */ -#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ - (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) - -#if __has_builtin(__builtin_shufflevector) -#define _sse2rvv_shuffle(type, a, b, ...) \ - __builtin_shufflevector(a, b, __VA_ARGS__) -#elif __has_builtin(__builtin_shuffle) -#define _sse2rvv_shuffle(type, a, b, ...) \ - __extension__({ \ - type tmp = {__VA_ARGS__}; \ - __builtin_shuffle(a, b, tmp); \ - }) -#endif - -#ifdef _sse2rvv_shuffle -#define vshuffle_s16(a, b, ...) _sse2rvv_shuffle(int16x4_t, a, b, __VA_ARGS__) -#define vshuffleq_s16(a, b, ...) _sse2rvv_shuffle(int16x8_t, a, b, __VA_ARGS__) -#define vshuffle_s32(a, b, ...) _sse2rvv_shuffle(int32x2_t, a, b, __VA_ARGS__) -#define vshuffleq_s32(a, b, ...) _sse2rvv_shuffle(int32x4_t, a, b, __VA_ARGS__) -#define vshuffle_s64(a, b, ...) _sse2rvv_shuffle(int64x1_t, a, b, __VA_ARGS__) -#define vshuffleq_s64(a, b, ...) _sse2rvv_shuffle(int64x2_t, a, b, __VA_ARGS__) -#endif - -/* Rounding mode macros. */ -#define _MM_FROUND_TO_NEAREST_INT 0x00 -#define _MM_FROUND_TO_NEG_INF 0x01 -#define _MM_FROUND_TO_POS_INF 0x02 -#define _MM_FROUND_TO_ZERO 0x03 -#define _MM_FROUND_CUR_DIRECTION 0x04 -#define _MM_FROUND_NO_EXC 0x08 -#define _MM_FROUND_RAISE_EXC 0x00 -#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) -#define _MM_ROUND_NEAREST 0x0000 -#define _MM_ROUND_DOWN 0x2000 -#define _MM_ROUND_UP 0x4000 -#define _MM_ROUND_TOWARD_ZERO 0x6000 -/* Flush zero mode macros. */ -#define _MM_FLUSH_ZERO_MASK 0x8000 -#define _MM_FLUSH_ZERO_ON 0x8000 -#define _MM_FLUSH_ZERO_OFF 0x0000 -/* Denormals are zeros mode macros. */ -#define _MM_DENORMALS_ZERO_MASK 0x0040 -#define _MM_DENORMALS_ZERO_ON 0x0040 -#define _MM_DENORMALS_ZERO_OFF 0x0000 - -/* indicate immediate constant argument in a given range */ -#define __constrange(a, b) const - /* A few intrinsics accept traditional data types like ints or floats, but * most operate on data types that are specific to SSE. * If a vector type ends in d, it contains doubles, and if it does not have @@ -277,7 +138,7 @@ typedef union ALIGN_STRUCT(16) SIMDVec { // Function declaration // SSE -FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); // FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); // FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); // FORCE_INLINE __m128 _mm_set_ps1(float); @@ -840,24 +701,6 @@ typedef struct { // FORCE_INLINE void _mm_free(void *addr) {} #endif -FORCE_INLINE uint64_t _sse2rvv_get_fpcr(void) { - uint64_t value; -#if defined(_MSC_VER) - value = _ReadStatusReg(ARM64_FPCR); -#else - __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ -#endif - return value; -} - -FORCE_INLINE void _sse2rvv_set_fpcr(uint64_t value) { -#if defined(_MSC_VER) - _WriteStatusReg(ARM64_FPCR, value); -#else - __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ -#endif -} - // Macro: Get the flush zero bits from the MXCSR control and status register. 
// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or // _MM_FLUSH_ZERO_OFF @@ -868,28 +711,7 @@ FORCE_INLINE void _sse2rvv_set_fpcr(uint64_t value) { // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE -FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2rvv_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - if (r.field.bit22) { - return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; - } else { - return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; - } -} +// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) {} // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. @@ -1198,46 +1020,7 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) { // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE -FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2rvv_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - switch (rounding) { - case _MM_ROUND_TOWARD_ZERO: - r.field.bit22 = 1; - r.field.bit23 = 1; - break; - case _MM_ROUND_DOWN: - r.field.bit22 = 0; - r.field.bit23 = 1; - break; - case _MM_ROUND_UP: - r.field.bit22 = 1; - r.field.bit23 = 0; - break; - default: //_MM_ROUND_NEAREST - r.field.bit22 = 0; - r.field.bit23 = 0; - } - -#if defined(__aarch64__) || defined(_M_ARM64) - _sse2rvv_set_fpcr(r.value); -#else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ -#endif -} +// FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) {} // Copy single-precision (32-bit) floating-point element a to the lower element // of dst, and zero the upper 3 elements. 
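A note on reimplementing these stubs later: on RISC-V the dynamic rounding mode lives in the frm field of the fcsr CSR (RNE = 0, RTZ = 1, RDN = 2, RUP = 3), so _MM_GET_ROUNDING_MODE could be expressed as a CSR read instead of the removed ARM FPCR access. A minimal, untested sketch, assuming the _MM_ROUND_* constants are reinstated alongside it:

FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) {
  unsigned long frm;
  /* read the dynamic rounding-mode field of fcsr */
  __asm__ __volatile__("csrr %0, frm" : "=r"(frm));
  switch (frm) {
  case 1: /* RTZ */
    return _MM_ROUND_TOWARD_ZERO;
  case 2: /* RDN */
    return _MM_ROUND_DOWN;
  case 3: /* RUP */
    return _MM_ROUND_UP;
  default: /* RNE */
    return _MM_ROUND_NEAREST;
  }
}

_MM_SET_ROUNDING_MODE would mirror this with a "csrw frm, %0" write.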
@@ -3291,548 +3074,6 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 // FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) {} -/* SSE4.2 */ - -static const uint16_t ALIGN_STRUCT(16) _sse2rvv_cmpestr_mask16b[8] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, -}; -static const uint8_t ALIGN_STRUCT(16) _sse2rvv_cmpestr_mask8b[16] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, -}; - -/* specify the source data format */ -#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ -#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ -#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ -#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ - -/* specify the comparison operation */ -#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ -#define _SIDD_CMP_RANGES 0x04 /* compare ranges */ -#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ -#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ - -/* specify the polarity */ -#define _SIDD_POSITIVE_POLARITY 0x00 -#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 -#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ -#define _SIDD_MASKED_NEGATIVE_POLARITY \ - 0x30 /* negate results only before end of string */ - -/* specify the output selection in _mm_cmpXstri */ -#define _SIDD_LEAST_SIGNIFICANT 0x00 -#define _SIDD_MOST_SIGNIFICANT 0x40 - -/* specify the output selection in _mm_cmpXstrm */ -#define _SIDD_BIT_MASK 0x00 -#define _SIDD_UNIT_MASK 0x40 - -/* Pattern Matching for C macros. - * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms - */ - -/* catenate */ -#define SSE2RVV_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ -#define SSE2RVV_CAT(a, b) SSE2RVV_PRIMITIVE_CAT(a, b) - -#define SSE2RVV_IIF(c) SSE2RVV_PRIMITIVE_CAT(SSE2RVV_IIF_, c) -/* run the 2nd parameter */ -#define SSE2RVV_IIF_0(t, ...) __VA_ARGS__ -/* run the 1st parameter */ -#define SSE2RVV_IIF_1(t, ...) t - -#define SSE2RVV_COMPL(b) SSE2RVV_PRIMITIVE_CAT(SSE2RVV_COMPL_, b) -#define SSE2RVV_COMPL_0 1 -#define SSE2RVV_COMPL_1 0 - -#define SSE2RVV_DEC(x) SSE2RVV_PRIMITIVE_CAT(SSE2RVV_DEC_, x) -#define SSE2RVV_DEC_1 0 -#define SSE2RVV_DEC_2 1 -#define SSE2RVV_DEC_3 2 -#define SSE2RVV_DEC_4 3 -#define SSE2RVV_DEC_5 4 -#define SSE2RVV_DEC_6 5 -#define SSE2RVV_DEC_7 6 -#define SSE2RVV_DEC_8 7 -#define SSE2RVV_DEC_9 8 -#define SSE2RVV_DEC_10 9 -#define SSE2RVV_DEC_11 10 -#define SSE2RVV_DEC_12 11 -#define SSE2RVV_DEC_13 12 -#define SSE2RVV_DEC_14 13 -#define SSE2RVV_DEC_15 14 -#define SSE2RVV_DEC_16 15 - -/* detection */ -#define SSE2RVV_CHECK_N(x, n, ...) n -#define SSE2RVV_CHECK(...) SSE2RVV_CHECK_N(__VA_ARGS__, 0, ) -#define SSE2RVV_PROBE(x) x, 1, - -#define SSE2RVV_NOT(x) SSE2RVV_CHECK(SSE2RVV_PRIMITIVE_CAT(SSE2RVV_NOT_, x)) -#define SSE2RVV_NOT_0 SSE2RVV_PROBE(~) - -#define SSE2RVV_BOOL(x) SSE2RVV_COMPL(SSE2RVV_NOT(x)) -#define SSE2RVV_IF(c) SSE2RVV_IIF(SSE2RVV_BOOL(c)) - -#define SSE2RVV_EAT(...) -#define SSE2RVV_EXPAND(...) __VA_ARGS__ -#define SSE2RVV_WHEN(c) SSE2RVV_IF(c)(SSE2RVV_EXPAND, SSE2RVV_EAT) - -/* recursion */ -/* deferred expression */ -#define SSE2RVV_EMPTY() -#define SSE2RVV_DEFER(id) id SSE2RVV_EMPTY() -#define SSE2RVV_OBSTRUCT(...) __VA_ARGS__ SSE2RVV_DEFER(SSE2RVV_EMPTY)() -#define SSE2RVV_EXPAND(...) __VA_ARGS__ - -#define SSE2RVV_EVAL(...) 
\ - SSE2RVV_EVAL1(SSE2RVV_EVAL1(SSE2RVV_EVAL1(__VA_ARGS__))) -#define SSE2RVV_EVAL1(...) \ - SSE2RVV_EVAL2(SSE2RVV_EVAL2(SSE2RVV_EVAL2(__VA_ARGS__))) -#define SSE2RVV_EVAL2(...) \ - SSE2RVV_EVAL3(SSE2RVV_EVAL3(SSE2RVV_EVAL3(__VA_ARGS__))) -#define SSE2RVV_EVAL3(...) __VA_ARGS__ - -#define SSE2RVV_REPEAT(count, macro, ...) \ - SSE2RVV_WHEN(count) \ - (SSE2RVV_OBSTRUCT(SSE2RVV_REPEAT_INDIRECT)()( \ - SSE2RVV_DEC(count), macro, \ - __VA_ARGS__)SSE2RVV_OBSTRUCT(macro)(SSE2RVV_DEC(count), __VA_ARGS__)) -#define SSE2RVV_REPEAT_INDIRECT() SSE2RVV_REPEAT - -#define SSE2RVV_SIZE_OF_byte 8 -#define SSE2RVV_NUMBER_OF_LANES_byte 16 -#define SSE2RVV_SIZE_OF_word 16 -#define SSE2RVV_NUMBER_OF_LANES_word 8 - -#define SSE2RVV_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ - mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ - vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ - vreinterpretq_##type##_m128i(a))); - -#define SSE2RVV_FILL_LANE(i, type) \ - vec_b[i] = \ - vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); - -#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ - number_of_lanes, byte_or_word) \ - do { \ - SSE2RVV_CAT( \ - data_type_prefix, \ - SSE2RVV_CAT(size, SSE2RVV_CAT(x, SSE2RVV_CAT(number_of_lanes, _t)))) \ - vec_b[number_of_lanes]; \ - __m128i mask = SSE2RVV_IIF(byte_or_word)( \ - vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ - vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ - SSE2RVV_EVAL(SSE2RVV_REPEAT(number_of_lanes, SSE2RVV_FILL_LANE, \ - SSE2RVV_CAT(type_prefix, size))) \ - for (int i = 0; i < number_of_lanes; i++) { \ - mtx[i] = \ - SSE2RVV_CAT(vreinterpretq_m128i_u, size)(SSE2RVV_CAT(vbslq_u, size)( \ - SSE2RVV_CAT(vreinterpretq_u, SSE2RVV_CAT(size, _m128i))(mask), \ - SSE2RVV_CAT(vcgeq_, SSE2RVV_CAT(type_prefix, size))( \ - vec_b[i], \ - SSE2RVV_CAT(vreinterpretq_, \ - SSE2RVV_CAT(type_prefix, \ - SSE2RVV_CAT(size, _m128i(a))))), \ - SSE2RVV_CAT(vcleq_, SSE2RVV_CAT(type_prefix, size))( \ - vec_b[i], \ - SSE2RVV_CAT(vreinterpretq_, \ - SSE2RVV_CAT(type_prefix, \ - SSE2RVV_CAT(size, _m128i(a))))))); \ - } \ - } while (0) - -#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ - do { \ - SSE2RVV_EVAL(SSE2RVV_REPEAT(number_of_lanes, \ - SSE2RVV_COMPARE_EQUAL_THEN_FILL_LANE, \ - SSE2RVV_CAT(u, size))) \ - } while (0) - -#define SSE2RVV_CMP_EQUAL_ANY_IMPL(type) \ - static int _sse2rvv_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ - int lb) { \ - __m128i mtx[16]; \ - PCMPSTR_EQ(a, b, mtx, SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type)); \ - return SSE2RVV_CAT( \ - _sse2rvv_aggregate_equal_any_, \ - SSE2RVV_CAT(SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(x, SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ - } - -#define SSE2RVV_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ - static int _sse2rvv_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ - int lb) { \ - __m128i mtx[16]; \ - PCMPSTR_RANGES(a, b, mtx, data_type, us, \ - SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type), byte_or_word); \ - return SSE2RVV_CAT( \ - _sse2rvv_aggregate_ranges_, \ - SSE2RVV_CAT(SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(x, SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ - } - -#define SSE2RVV_CMP_EQUAL_ORDERED_IMPL(type) \ - static int _sse2rvv_cmp_##type##_equal_ordered(__m128i a, int la, __m128i b, \ - int lb) { \ - __m128i mtx[16]; \ - PCMPSTR_EQ(a, b, mtx, SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), 
\ - SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type)); \ - return SSE2RVV_CAT( \ - _sse2rvv_aggregate_equal_ordered_, \ - SSE2RVV_CAT( \ - SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(x, SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type))))( \ - SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type), la, lb, mtx); \ - } - -static int _sse2rvv_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint8x8_t vec_mask = vld1_u8(_sse2rvv_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); - uint8x16_t vec = vcombine_u8(t_lo, t_hi); - for (int j = 0; j < lb; j++) { - mtx[j] = - vreinterpretq_m128i_u8(vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); - mtx[j] = - vreinterpretq_m128i_u8(vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); - int tmp = _sse2rvv_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; - res |= (tmp << j); - } - return res; -} - -static int _sse2rvv_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint16x8_t vec = - vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2rvv_cmpestr_mask16b)); - for (int j = 0; j < lb; j++) { - mtx[j] = vreinterpretq_m128i_u16( - vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); - mtx[j] = vreinterpretq_m128i_u16( - vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); - int tmp = _sse2rvv_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; - res |= (tmp << j); - } - return res; -} - -/* clang-format off */ -#define SSE2RVV_GENERATE_CMP_EQUAL_ANY(prefix) \ - prefix##IMPL(byte) \ - prefix##IMPL(word) -/* clang-format on */ - -SSE2RVV_GENERATE_CMP_EQUAL_ANY(SSE2RVV_CMP_EQUAL_ANY_) - -static int _sse2rvv_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint16x8_t vec = - vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2rvv_cmpestr_mask16b)); - for (int j = 0; j < lb; j++) { - mtx[j] = vreinterpretq_m128i_u16( - vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); - mtx[j] = vreinterpretq_m128i_u16( - vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); - __m128i tmp = vreinterpretq_m128i_u32( - vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); - uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), - vreinterpretq_u32_m128i(tmp)); -#if defined(__aarch64__) || defined(_M_ARM64) - int t = vaddvq_u32(vec_res) ? 1 : 0; -#else - uint64x2_t sumh = vpaddlq_u32(vec_res); - int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); -#endif - res |= (t << j); - } - return res; -} - -static int _sse2rvv_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint8x8_t vec_mask = vld1_u8(_sse2rvv_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); - uint8x16_t vec = vcombine_u8(t_lo, t_hi); - for (int j = 0; j < lb; j++) { - mtx[j] = - vreinterpretq_m128i_u8(vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); - mtx[j] = - vreinterpretq_m128i_u8(vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); - __m128i tmp = vreinterpretq_m128i_u16( - vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); - uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), - vreinterpretq_u16_m128i(tmp)); - int t = _sse2rvv_vaddvq_u16(vec_res) ? 
1 : 0; - res |= (t << j); - } - return res; -} - -#define SSE2RVV_CMP_RANGES_IS_BYTE 1 -#define SSE2RVV_CMP_RANGES_IS_WORD 0 - -/* clang-format off */ -#define SSE2RVV_GENERATE_CMP_RANGES(prefix) \ - prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \ - prefix##IMPL(byte, int, s, prefix##IS_BYTE) \ - prefix##IMPL(word, uint, u, prefix##IS_WORD) \ - prefix##IMPL(word, int, s, prefix##IS_WORD) -/* clang-format on */ - -SSE2RVV_GENERATE_CMP_RANGES(SSE2RVV_CMP_RANGES_) - -#undef SSE2RVV_CMP_RANGES_IS_BYTE -#undef SSE2RVV_CMP_RANGES_IS_WORD - -static int _sse2rvv_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) { - uint8x16_t mtx = - vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); - int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); - int m1 = 0x10000 - (1 << la); - int tb = 0x10000 - (1 << lb); - uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; - uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; - vec_mask = vld1_u8(_sse2rvv_cmpestr_mask8b); - vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); - vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); - vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); - vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); - tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); - tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); - - res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); - res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); - res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); - res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); - res_lo = vand_u8(res_lo, vec_mask); - res_hi = vand_u8(res_hi, vec_mask); - - int res = _sse2rvv_vaddv_u8(res_lo) + (_sse2rvv_vaddv_u8(res_hi) << 8); - return res; -} - -static int _sse2rvv_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) { - uint16x8_t mtx = - vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); - int m0 = (la < lb) ? 
0 : ((1 << la) - (1 << lb)); - int m1 = 0x100 - (1 << la); - int tb = 0x100 - (1 << lb); - uint16x8_t vec_mask = vld1q_u16(_sse2rvv_cmpestr_mask16b); - uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); - uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); - uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); - mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); - mtx = vbslq_u16(vec1, tmp, mtx); - mtx = vandq_u16(mtx, vec_mask); - return _sse2rvv_vaddvq_u16(mtx); -} - -#define SSE2RVV_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 -#define SSE2RVV_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 - -#define SSE2RVV_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ - static int _sse2rvv_aggregate_equal_ordered_##size##x##number_of_lanes( \ - int bound, int la, int lb, __m128i mtx[16]) { \ - int res = 0; \ - int m1 = SSE2RVV_IIF(data_type)(0x10000, 0x100) - (1 << la); \ - uint##size##x8_t vec_mask = \ - SSE2RVV_IIF(data_type)(vld1_u##size(_sse2rvv_cmpestr_mask##size##b), \ - vld1q_u##size(_sse2rvv_cmpestr_mask##size##b)); \ - uint##size##x##number_of_lanes##_t vec1 = SSE2RVV_IIF(data_type)( \ - vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ - vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ - vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ - uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ - uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ - for (int j = 0; j < lb; j++) { \ - mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ - vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ - } \ - for (int j = lb; j < bound; j++) { \ - mtx[j] = vreinterpretq_m128i_u##size( \ - vbslq_u##size(vec1, vec_minusone, vec_zero)); \ - } \ - unsigned SSE2RVV_IIF(data_type)(char, short) *ptr = \ - (unsigned SSE2RVV_IIF(data_type)(char, short) *)mtx; \ - for (int i = 0; i < bound; i++) { \ - int val = 1; \ - for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ - val &= ptr[k * bound + j]; \ - res += val << i; \ - } \ - return res; \ - } - -/* clang-format off */ -#define SSE2RVV_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ - prefix##IMPL(8, 16, prefix##IS_UBYTE) \ - prefix##IMPL(16, 8, prefix##IS_UWORD) -/* clang-format on */ - -SSE2RVV_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2RVV_AGGREGATE_EQUAL_ORDER_) - -#undef SSE2RVV_AGGREGATE_EQUAL_ORDER_IS_UBYTE -#undef SSE2RVV_AGGREGATE_EQUAL_ORDER_IS_UWORD - -/* clang-format off */ -#define SSE2RVV_GENERATE_CMP_EQUAL_ORDERED(prefix) \ - prefix##IMPL(byte) \ - prefix##IMPL(word) -/* clang-format on */ - -SSE2RVV_GENERATE_CMP_EQUAL_ORDERED(SSE2RVV_CMP_EQUAL_ORDERED_) - -#define SSE2RVV_CMPESTR_LIST \ - _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ - _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ - _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ - _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ - _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ - _(CMP_UWORD_RANGES, cmp_uword_ranges) \ - _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ - _(CMP_SWORD_RANGES, cmp_sword_ranges) \ - _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ - _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ - _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ - _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ - _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ - _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ - _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ - _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) - -enum { -#define _(name, func_suffix) name, - SSE2RVV_CMPESTR_LIST -#undef _ -}; -typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); 
-static cmpestr_func_t _sse2rvv_cmpfunc_table[] = { -#define _(name, func_suffix) _sse2rvv_##func_suffix, - SSE2RVV_CMPESTR_LIST -#undef _ -}; - -FORCE_INLINE int _sse2rvv_sido_negative(int res, int lb, int imm8, int bound) { - switch (imm8 & 0x30) { - case _SIDD_NEGATIVE_POLARITY: - res ^= 0xffffffff; - break; - case _SIDD_MASKED_NEGATIVE_POLARITY: - res ^= (1 << lb) - 1; - break; - default: - break; - } - - return res & ((bound == 8) ? 0xFF : 0xFFFF); -} - -FORCE_INLINE int _sse2rvv_clz(unsigned int x) { -#ifdef _MSC_VER - unsigned long cnt = 0; - if (_BitScanReverse(&cnt, x)) - return 31 - cnt; - return 32; -#else - return x != 0 ? __builtin_clz(x) : 32; -#endif -} - -FORCE_INLINE int _sse2rvv_ctz(unsigned int x) { -#ifdef _MSC_VER - unsigned long cnt = 0; - if (_BitScanForward(&cnt, x)) - return cnt; - return 32; -#else - return x != 0 ? __builtin_ctz(x) : 32; -#endif -} - -FORCE_INLINE int _sse2rvv_ctzll(unsigned long long x) { -#ifdef _MSC_VER - unsigned long cnt; -#if defined(SSE2RVV_HAS_BITSCAN64) - if (_BitScanForward64(&cnt, x)) - return (int)(cnt); -#else - if (_BitScanForward(&cnt, (unsigned long)(x))) - return (int)cnt; - if (_BitScanForward(&cnt, (unsigned long)(x >> 32))) - return (int)(cnt + 32); -#endif /* SSE2RVV_HAS_BITSCAN64 */ - return 64; -#else /* assume GNU compatible compilers */ - return x != 0 ? __builtin_ctzll(x) : 64; -#endif -} - -#define SSE2RVV_MIN(x, y) (x) < (y) ? (x) : (y) - -#define SSE2RVV_CMPSTR_SET_UPPER(var, imm) const int var = (imm & 0x01) ? 8 : 16 - -#define SSE2RVV_CMPESTRX_LEN_PAIR(a, b, la, lb) \ - int tmp1 = la ^ (la >> 31); \ - la = tmp1 - (la >> 31); \ - int tmp2 = lb ^ (lb >> 31); \ - lb = tmp2 - (lb >> 31); \ - la = SSE2RVV_MIN(la, bound); \ - lb = SSE2RVV_MIN(lb, bound) - -// Compare all pairs of character in string a and b, -// then aggregate the result. -// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the -// length of string, we use SSE2RVV_CMP{I,E}STRX_GET_LEN to get the length of -// string a and b. -#define SSE2RVV_COMP_AGG(a, b, la, lb, imm8, IE) \ - SSE2RVV_CMPSTR_SET_UPPER(bound, imm8); \ - SSE2RVV_##IE##_LEN_PAIR(a, b, la, lb); \ - int r2 = (_sse2rvv_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ - r2 = _sse2rvv_sido_negative(r2, lb, imm8, bound) - -#define SSE2RVV_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ - return (r2 == 0) \ - ? bound \ - : ((imm8 & 0x40) ? (31 - _sse2rvv_clz(r2)) : _sse2rvv_ctz(r2)) - -#define SSE2RVV_CMPSTR_GENERATE_MASK(dst) \ - __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ - if (imm8 & 0x40) { \ - if (bound == 8) { \ - uint16x8_t tmp = \ - vtstq_u16(vdupq_n_u16(r2), vld1q_u16(_sse2rvv_cmpestr_mask16b)); \ - dst = vreinterpretq_m128i_u16( \ - vbslq_u16(tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ - } else { \ - uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ - uint8x16_t tmp = vtstq_u8(vec_r2, vld1q_u8(_sse2rvv_cmpestr_mask8b)); \ - dst = vreinterpretq_m128i_u8( \ - vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ - } \ - } else { \ - if (bound == 16) { \ - dst = vreinterpretq_m128i_u16( \ - vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ - } else { \ - dst = vreinterpretq_m128i_u8( \ - vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ - } \ - } \ - return dst - // Compare packed strings in a and b with lengths la and lb using the control // in imm8, and returns 1 if b did not contain a null character and the // resulting mask was zero, and 0 otherwise. 
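The SSE2RVV_CMPESTRX_LEN_PAIR macro removed above clamps the explicit lengths with a branchless absolute value: the arithmetic shift la >> 31 yields 0 for non-negative values and -1 for negative ones, so the XOR followed by a subtraction either leaves the value unchanged or performs a two's-complement negation. A self-contained demonstration of the trick (assumes a 32-bit int with arithmetic right shift, as the macro itself does):

#include <assert.h>

static int branchless_abs(int x) {
  int s = x >> 31;    /* 0 if x >= 0, -1 if x < 0 */
  return (x ^ s) - s; /* identity when s == 0, negation when s == -1 */
}

int main(void) {
  assert(branchless_abs(-7) == 7);
  assert(branchless_abs(5) == 5);
  return 0;
}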
@@ -3864,11 +3105,8 @@ FORCE_INLINE int _sse2rvv_ctzll(unsigned long long x) { // Compare packed strings in a and b with lengths la and lb using the control // in imm8, and store the generated mask in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm -FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, - const int imm8) { - SSE2RVV_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); - SSE2RVV_CMPSTR_GENERATE_MASK(dst); -} +// FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, +// const int imm8) {} // Compare packed strings in a and b with lengths la and lb using the control in // imm8, and returns bit 0 of the resulting bit mask. @@ -3897,32 +3135,6 @@ FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, // int lb, // const int imm8) {} -#define SSE2RVV_CMPISTRX_LENGTH(str, len, imm8) \ - do { \ - if (imm8 & 0x01) { \ - uint16x8_t equal_mask_##str = \ - vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ - uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ - uint64_t matches_##str = \ - vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ - len = _sse2rvv_ctzll(matches_##str) >> 3; \ - } else { \ - uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ - vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ - uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ - uint64_t matches_##str = \ - vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ - len = _sse2rvv_ctzll(matches_##str) >> 2; \ - } \ - } while (0) - -#define SSE2RVV_CMPISTRX_LEN_PAIR(a, b, la, lb) \ - int la, lb; \ - do { \ - SSE2RVV_CMPISTRX_LENGTH(a, la, imm8); \ - SSE2RVV_CMPISTRX_LENGTH(b, lb, imm8); \ - } while (0) - // Compare packed strings with implicit lengths in a and b using the control in // imm8, and returns 1 if b did not contain a null character and the resulting // mask was zero, and 0 otherwise. 
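For context, the SSE2RVV_CMPISTRX_LENGTH macro deleted above derived the implicit string length by comparing each lane against zero, narrowing the result into a 64-bit mask, and counting trailing zero bits scaled by the per-lane bit width. A plain scalar sketch of the same computation for the byte case (sse2rvv_implicit_len8 is a hypothetical helper, not part of this patch):

#include <stdint.h>

/* Index of the first NUL byte in the 16-byte image of a vector, capped at 16.
 * The 16-bit-character variant would scan uint16_t lanes and cap at 8. */
static int sse2rvv_implicit_len8(const uint8_t bytes[16]) {
  int len = 0;
  while (len < 16 && bytes[len] != 0)
    len++;
  return len;
}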
@@ -3985,106 +3197,6 @@ FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, /* AES */ -#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) -/* clang-format off */ -#define SSE2RVV_AES_SBOX(w) \ - { \ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ - w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ - w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ - w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ - w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ - w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ - w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ - w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ - w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ - w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ - w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ - w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ - w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ - w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ - w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ - w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ - w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ - w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ - w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ - w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ - w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ - w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ - w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ - w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ - w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ - w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ - w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ - w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ - w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ - w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ - w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ - w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ - w(0xb0), w(0x54), w(0xbb), w(0x16) \ - } -#define SSE2RVV_AES_RSBOX(w) \ - { \ - w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ - w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ - w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ - w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ - w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ - w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ - w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ - w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \ - w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \ - w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \ - w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \ - w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \ - 
w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \ - w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \ - w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \ - w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \ - w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \ - w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \ - w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \ - w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \ - w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \ - w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \ - w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \ - w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \ - w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \ - w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \ - w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \ - w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \ - w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \ - w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \ - w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \ - w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \ - w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \ - w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \ - w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \ - w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \ - w(0x55), w(0x21), w(0x0c), w(0x7d) \ - } -/* clang-format on */ - -/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ -#define SSE2RVV_AES_H0(x) (x) -static const uint8_t _sse2rvv_sbox[256] = SSE2RVV_AES_SBOX(SSE2RVV_AES_H0); -static const uint8_t _sse2rvv_rsbox[256] = SSE2RVV_AES_RSBOX(SSE2RVV_AES_H0); -#undef SSE2RVV_AES_H0 - -/* x_time function and matrix multiply function */ -#if !defined(__aarch64__) && !defined(_M_ARM64) -#define SSE2RVV_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) -#define SSE2RVV_MULTIPLY(x, y) \ - (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2RVV_XT(x)) ^ \ - ((y >> 2 & 1) * SSE2RVV_XT(SSE2RVV_XT(x))) ^ \ - ((y >> 3 & 1) * SSE2RVV_XT(SSE2RVV_XT(SSE2RVV_XT(x)))) ^ \ - ((y >> 4 & 1) * SSE2RVV_XT(SSE2RVV_XT(SSE2RVV_XT(SSE2RVV_XT(x)))))) -#endif - // In the absence of crypto extensions, implement aesenc using regular NEON // intrinsics instead. See: // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ @@ -4121,48 +3233,6 @@ static const uint8_t _sse2rvv_rsbox[256] = SSE2RVV_AES_RSBOX(SSE2RVV_AES_H0); // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ // for details. // FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) {} -#undef SSE2RVV_AES_SBOX -#undef SSE2RVV_AES_RSBOX - -#if defined(__aarch64__) -#undef SSE2RVV_XT -#undef SSE2RVV_MULTIPLY -#endif - -#else /* __ARM_FEATURE_CRYPTO */ -// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and -// AESMC and then manually applying the real key as an xor operation. This -// unfortunately means an additional xor op; the compiler should be able to -// optimize this away for repeated calls however. See -// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a -// for more details. 
-// FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) {} - -// Perform one round of an AES decryption flow on data (state) in a using the -// round key in RoundKey, and store the result in dst. -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 -// FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) {} - -// Perform the last round of an AES encryption flow on data (state) in a using -// the round key in RoundKey, and store the result in dst. -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 -// FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) {} - -// Perform the last round of an AES decryption flow on data (state) in a using -// the round key in RoundKey, and store the result in dst. -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 -// FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) {} - -// Perform the InvMixColumns transformation on a and store the result in dst. -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 -// FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) {} - -// Assist in expanding the AES cipher key by computing steps towards generating -// a round key for encryption cipher using data from a and an 8-bit round -// constant specified in imm8, and store the result in dst." -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 -// FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) {} -#endif /* Others */ @@ -4188,43 +3258,7 @@ static const uint8_t _sse2rvv_rsbox[256] = SSE2RVV_AES_RSBOX(SSE2RVV_AES_H0); // Return the current 64-bit value of the processor's time-stamp counter. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc -FORCE_INLINE uint64_t _rdtsc(void) { -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t val; - - /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the - * system counter is at least 56 bits wide; from Armv8.6, the counter - * must be 64 bits wide. So the system counter could be less than 64 - * bits wide and it is attributed with the flag 'cap_user_time_short' - * is true. - */ -#if defined(_MSC_VER) - val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); -#else - __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); -#endif - - return val; -#else - uint32_t pmccntr, pmuseren, pmcntenset; - // Read the user mode Performance Monitoring Unit (PMU) - // User Enable Register (PMUSERENR) access permissions. - __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); - if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code. - __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); - if (pmcntenset & 0x80000000UL) { // Is it counting? - __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); - // The counter is set up to count every 64th cycle - return (uint64_t)(pmccntr) << 6; - } - } - - // Fallback to syscall as we can't enable PMUSERENR in user mode. 
-  struct timeval tv;
-  gettimeofday(&tv, NULL);
-  return (uint64_t)(tv.tv_sec) * 1000000 + tv.tv_usec;
-#endif
-}
+// FORCE_INLINE uint64_t _rdtsc(void) {}
 
 #if defined(__GNUC__) || defined(__clang__)
 #pragma pop_macro("ALIGN_STRUCT")
diff --git a/tests/impl.cpp b/tests/impl.cpp
index ef8eeae..cb03180 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -11229,6 +11229,12 @@ result_t SSE2RVV_TEST_IMPL::run_single_test(INSTRUCTION_TEST test, uint32_t i) {
   return ret;
 }
 
+const char *instruction_string[] = {
+#define _(x) #x,
+    INTRIN_LIST
+#undef _
+};
+
 SSE2RVV_TEST *SSE2RVV_TEST::create(void) {
   SSE2RVV_TEST_IMPL *st = new SSE2RVV_TEST_IMPL;
   return static_cast<SSE2RVV_TEST *>(st);
diff --git a/tests/impl.h b/tests/impl.h
index 60f02b9..d05d5f8 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -545,7 +545,7 @@ namespace SSE2RVV {
 // A short C implementation of every intrinsic is implemented and compared to
 // the actual expected results from the corresponding SSE intrinsic against all
 // of the 10,000 randomized input vectors. When running on RISCV, then the
-// results are compared to the NEON approximate version.
+// results are compared to the RISCV approximate version.
 extern const char *instruction_string[];
 enum INSTRUCTION_TEST {
 #define _(x) it_##x,
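The instruction_string table added in tests/impl.cpp is generated from the same INTRIN_LIST X macro that tests/impl.h expands into the it_* enumerators, so the enum values and the printable names stay aligned by construction. A self-contained illustration of the pattern (TOY_LIST is a stand-in, not the project's real INTRIN_LIST):

#include <stdio.h>

#define TOY_LIST \
  _(mm_add_ps)   \
  _(mm_sub_ps)

enum {
#define _(x) it_##x,
  TOY_LIST
#undef _
};

static const char *toy_string[] = {
#define _(x) #x,
  TOY_LIST
#undef _
};

int main(void) {
  /* prints "1 -> mm_sub_ps": the enum index selects the matching name */
  printf("%d -> %s\n", it_mm_sub_ps, toy_string[it_mm_sub_ps]);
  return 0;
}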