diff --git a/Makefile b/Makefile index 2a11bea..9a84284 100644 --- a/Makefile +++ b/Makefile @@ -28,8 +28,6 @@ else # CROSS_COMPILE was set $(error Unsupported cross-compiler) endif - ARCH_CFLAGS = -march=$(processor)gcv_zba - ifeq ($(SIMULATOR_TYPE), qemu) SIMULATOR += qemu-riscv64 SIMULATOR_FLAGS = -cpu $(processor),v=true,zba=true,vlen=128 @@ -40,6 +38,14 @@ else # CROSS_COMPILE was set endif endif +ifeq ($(processor),$(filter $(processor),rv64 rv32)) + ARCH_CFLAGS = -march=$(processor)gcv_zba +else ifeq ($(processor),$(filter $(processor),i386 x86_64)) + ARCH_CFLAGS = -maes -mpclmul -mssse3 -msse4.2 +else + $(error Unsupported architecture) +endif + CXXFLAGS += -Wall -Wcast-qual -I. $(ARCH_CFLAGS) LDFLAGS += -lm OBJS = \ diff --git a/sse2rvv.h b/sse2rvv.h index 3a7b3e6..c58d709 100644 --- a/sse2rvv.h +++ b/sse2rvv.h @@ -73,145 +73,6 @@ #include #include -#if defined(__GNUC__) || defined(__clang__) -#define _sse2rvv_define0(type, s, body) \ - __extension__({ \ - type _a = (s); \ - body \ - }) -#define _sse2rvv_define1(type, s, body) \ - __extension__({ \ - type _a = (s); \ - body \ - }) -#define _sse2rvv_define2(type, a, b, body) \ - __extension__({ \ - type _a = (a), _b = (b); \ - body \ - }) -#define _sse2rvv_return(ret) (ret) -#else -#define _sse2rvv_define0(type, a, body) [=](type _a) { body }(a) -#define _sse2rvv_define1(type, a, body) [](type _a) { body }(a) -#define _sse2rvv_define2(type, a, b, body) \ - [](type _a, type _b) { body }((a), (b)) -#define _sse2rvv_return(ret) return ret -#endif - -#define _sse2rvv_init(...) \ - { __VA_ARGS__ } - -/* Compiler barrier */ -#define SSE2RVV_BARRIER() \ - do { \ - __asm__ __volatile__("" ::: "memory"); \ - (void)0; \ - } while (0) - -/* Memory barriers - * __atomic_thread_fence does not include a compiler barrier; instead, - * the barrier is part of __atomic_load/__atomic_store's "volatile-like" - * semantics. - */ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) -#include -#endif - -FORCE_INLINE void _sse2rvv_smp_mb(void) { - SSE2RVV_BARRIER(); -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(memory_order_seq_cst); -#elif defined(__GNUC__) || defined(__clang__) - __atomic_thread_fence(__ATOMIC_SEQ_CST); -#endif -} - -/* "__has_builtin" can be used to query support for built-in functions - * provided by gcc/clang and other compilers that support it. - */ -#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ -/* Compatibility with gcc <= 9 */ -#if defined(__GNUC__) && (__GNUC__ <= 9) -#define __has_builtin(x) HAS##x -#define HAS__builtin_popcount 1 -#define HAS__builtin_popcountll 1 - -// __builtin_shuffle introduced in GCC 4.7.0 -#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) -#define HAS__builtin_shuffle 1 -#else -#define HAS__builtin_shuffle 0 -#endif - -#define HAS__builtin_shufflevector 0 -#define HAS__builtin_nontemporal_store 0 -#else -#define __has_builtin(x) 0 -#endif -#endif - -/** - * MACRO for shuffle parameter for _mm_shuffle_ps(). - * Argument fp3 is a digit[0123] that represents the fp from argument "b" - * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same - * for fp2 in result. fp1 is a digit[0123] that represents the fp from - * argument "a" of mm_shuffle_ps that will be places in fp1 of result. - * fp0 is the same for fp0 of result. 
- */ -#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ - (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) - -#if __has_builtin(__builtin_shufflevector) -#define _sse2rvv_shuffle(type, a, b, ...) \ - __builtin_shufflevector(a, b, __VA_ARGS__) -#elif __has_builtin(__builtin_shuffle) -#define _sse2rvv_shuffle(type, a, b, ...) \ - __extension__({ \ - type tmp = {__VA_ARGS__}; \ - __builtin_shuffle(a, b, tmp); \ - }) -#endif - -#ifdef _sse2rvv_shuffle -#define vshuffle_s16(a, b, ...) _sse2rvv_shuffle(int16x4_t, a, b, __VA_ARGS__) -#define vshuffleq_s16(a, b, ...) _sse2rvv_shuffle(int16x8_t, a, b, __VA_ARGS__) -#define vshuffle_s32(a, b, ...) _sse2rvv_shuffle(int32x2_t, a, b, __VA_ARGS__) -#define vshuffleq_s32(a, b, ...) _sse2rvv_shuffle(int32x4_t, a, b, __VA_ARGS__) -#define vshuffle_s64(a, b, ...) _sse2rvv_shuffle(int64x1_t, a, b, __VA_ARGS__) -#define vshuffleq_s64(a, b, ...) _sse2rvv_shuffle(int64x2_t, a, b, __VA_ARGS__) -#endif - -/* Rounding mode macros. */ -#define _MM_FROUND_TO_NEAREST_INT 0x00 -#define _MM_FROUND_TO_NEG_INF 0x01 -#define _MM_FROUND_TO_POS_INF 0x02 -#define _MM_FROUND_TO_ZERO 0x03 -#define _MM_FROUND_CUR_DIRECTION 0x04 -#define _MM_FROUND_NO_EXC 0x08 -#define _MM_FROUND_RAISE_EXC 0x00 -#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) -#define _MM_ROUND_NEAREST 0x0000 -#define _MM_ROUND_DOWN 0x2000 -#define _MM_ROUND_UP 0x4000 -#define _MM_ROUND_TOWARD_ZERO 0x6000 -/* Flush zero mode macros. */ -#define _MM_FLUSH_ZERO_MASK 0x8000 -#define _MM_FLUSH_ZERO_ON 0x8000 -#define _MM_FLUSH_ZERO_OFF 0x0000 -/* Denormals are zeros mode macros. */ -#define _MM_DENORMALS_ZERO_MASK 0x0040 -#define _MM_DENORMALS_ZERO_ON 0x0040 -#define _MM_DENORMALS_ZERO_OFF 0x0000 - -/* indicate immediate constant argument in a given range */ -#define __constrange(a, b) const - /* A few intrinsics accept traditional data types like ints or floats, but * most operate on data types that are specific to SSE. * If a vector type ends in d, it contains doubles, and if it does not have @@ -277,7 +138,7 @@ typedef union ALIGN_STRUCT(16) SIMDVec { // Function declaration // SSE -FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); // FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); // FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); // FORCE_INLINE __m128 _mm_set_ps1(float); @@ -840,24 +701,6 @@ typedef struct { // FORCE_INLINE void _mm_free(void *addr) {} #endif -FORCE_INLINE uint64_t _sse2rvv_get_fpcr(void) { - uint64_t value; -#if defined(_MSC_VER) - value = _ReadStatusReg(ARM64_FPCR); -#else - __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ -#endif - return value; -} - -FORCE_INLINE void _sse2rvv_set_fpcr(uint64_t value) { -#if defined(_MSC_VER) - _WriteStatusReg(ARM64_FPCR, value); -#else - __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ -#endif -} - // Macro: Get the flush zero bits from the MXCSR control and status register. 
// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or // _MM_FLUSH_ZERO_OFF @@ -868,28 +711,7 @@ FORCE_INLINE void _sse2rvv_set_fpcr(uint64_t value) { // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE -FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2rvv_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - if (r.field.bit22) { - return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; - } else { - return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; - } -} +// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) {} // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. @@ -1198,46 +1020,7 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) { // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE -FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2rvv_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - switch (rounding) { - case _MM_ROUND_TOWARD_ZERO: - r.field.bit22 = 1; - r.field.bit23 = 1; - break; - case _MM_ROUND_DOWN: - r.field.bit22 = 0; - r.field.bit23 = 1; - break; - case _MM_ROUND_UP: - r.field.bit22 = 1; - r.field.bit23 = 0; - break; - default: //_MM_ROUND_NEAREST - r.field.bit22 = 0; - r.field.bit23 = 0; - } - -#if defined(__aarch64__) || defined(_M_ARM64) - _sse2rvv_set_fpcr(r.value); -#else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ -#endif -} +// FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) {} // Copy single-precision (32-bit) floating-point element a to the lower element // of dst, and zero the upper 3 elements. 
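A note on reimplementing these stubs later: on RISC-V the dynamic rounding mode lives in the frm field of the fcsr CSR (RNE = 0, RTZ = 1, RDN = 2, RUP = 3), so _MM_GET_ROUNDING_MODE could be expressed as a CSR read instead of the removed ARM FPCR access. A minimal, untested sketch, assuming the _MM_ROUND_* constants are reinstated alongside it:

FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) {
  unsigned long frm;
  /* read the dynamic rounding-mode field of fcsr */
  __asm__ __volatile__("csrr %0, frm" : "=r"(frm));
  switch (frm) {
  case 1: /* RTZ */
    return _MM_ROUND_TOWARD_ZERO;
  case 2: /* RDN */
    return _MM_ROUND_DOWN;
  case 3: /* RUP */
    return _MM_ROUND_UP;
  default: /* RNE */
    return _MM_ROUND_NEAREST;
  }
}

_MM_SET_ROUNDING_MODE would mirror this with a "csrw frm, %0" write.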
@@ -3291,548 +3074,6 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 // FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) {} -/* SSE4.2 */ - -static const uint16_t ALIGN_STRUCT(16) _sse2rvv_cmpestr_mask16b[8] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, -}; -static const uint8_t ALIGN_STRUCT(16) _sse2rvv_cmpestr_mask8b[16] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, -}; - -/* specify the source data format */ -#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ -#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ -#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ -#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ - -/* specify the comparison operation */ -#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ -#define _SIDD_CMP_RANGES 0x04 /* compare ranges */ -#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ -#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ - -/* specify the polarity */ -#define _SIDD_POSITIVE_POLARITY 0x00 -#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 -#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ -#define _SIDD_MASKED_NEGATIVE_POLARITY \ - 0x30 /* negate results only before end of string */ - -/* specify the output selection in _mm_cmpXstri */ -#define _SIDD_LEAST_SIGNIFICANT 0x00 -#define _SIDD_MOST_SIGNIFICANT 0x40 - -/* specify the output selection in _mm_cmpXstrm */ -#define _SIDD_BIT_MASK 0x00 -#define _SIDD_UNIT_MASK 0x40 - -/* Pattern Matching for C macros. - * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms - */ - -/* catenate */ -#define SSE2RVV_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ -#define SSE2RVV_CAT(a, b) SSE2RVV_PRIMITIVE_CAT(a, b) - -#define SSE2RVV_IIF(c) SSE2RVV_PRIMITIVE_CAT(SSE2RVV_IIF_, c) -/* run the 2nd parameter */ -#define SSE2RVV_IIF_0(t, ...) __VA_ARGS__ -/* run the 1st parameter */ -#define SSE2RVV_IIF_1(t, ...) t - -#define SSE2RVV_COMPL(b) SSE2RVV_PRIMITIVE_CAT(SSE2RVV_COMPL_, b) -#define SSE2RVV_COMPL_0 1 -#define SSE2RVV_COMPL_1 0 - -#define SSE2RVV_DEC(x) SSE2RVV_PRIMITIVE_CAT(SSE2RVV_DEC_, x) -#define SSE2RVV_DEC_1 0 -#define SSE2RVV_DEC_2 1 -#define SSE2RVV_DEC_3 2 -#define SSE2RVV_DEC_4 3 -#define SSE2RVV_DEC_5 4 -#define SSE2RVV_DEC_6 5 -#define SSE2RVV_DEC_7 6 -#define SSE2RVV_DEC_8 7 -#define SSE2RVV_DEC_9 8 -#define SSE2RVV_DEC_10 9 -#define SSE2RVV_DEC_11 10 -#define SSE2RVV_DEC_12 11 -#define SSE2RVV_DEC_13 12 -#define SSE2RVV_DEC_14 13 -#define SSE2RVV_DEC_15 14 -#define SSE2RVV_DEC_16 15 - -/* detection */ -#define SSE2RVV_CHECK_N(x, n, ...) n -#define SSE2RVV_CHECK(...) SSE2RVV_CHECK_N(__VA_ARGS__, 0, ) -#define SSE2RVV_PROBE(x) x, 1, - -#define SSE2RVV_NOT(x) SSE2RVV_CHECK(SSE2RVV_PRIMITIVE_CAT(SSE2RVV_NOT_, x)) -#define SSE2RVV_NOT_0 SSE2RVV_PROBE(~) - -#define SSE2RVV_BOOL(x) SSE2RVV_COMPL(SSE2RVV_NOT(x)) -#define SSE2RVV_IF(c) SSE2RVV_IIF(SSE2RVV_BOOL(c)) - -#define SSE2RVV_EAT(...) -#define SSE2RVV_EXPAND(...) __VA_ARGS__ -#define SSE2RVV_WHEN(c) SSE2RVV_IF(c)(SSE2RVV_EXPAND, SSE2RVV_EAT) - -/* recursion */ -/* deferred expression */ -#define SSE2RVV_EMPTY() -#define SSE2RVV_DEFER(id) id SSE2RVV_EMPTY() -#define SSE2RVV_OBSTRUCT(...) __VA_ARGS__ SSE2RVV_DEFER(SSE2RVV_EMPTY)() -#define SSE2RVV_EXPAND(...) __VA_ARGS__ - -#define SSE2RVV_EVAL(...) 
\ - SSE2RVV_EVAL1(SSE2RVV_EVAL1(SSE2RVV_EVAL1(__VA_ARGS__))) -#define SSE2RVV_EVAL1(...) \ - SSE2RVV_EVAL2(SSE2RVV_EVAL2(SSE2RVV_EVAL2(__VA_ARGS__))) -#define SSE2RVV_EVAL2(...) \ - SSE2RVV_EVAL3(SSE2RVV_EVAL3(SSE2RVV_EVAL3(__VA_ARGS__))) -#define SSE2RVV_EVAL3(...) __VA_ARGS__ - -#define SSE2RVV_REPEAT(count, macro, ...) \ - SSE2RVV_WHEN(count) \ - (SSE2RVV_OBSTRUCT(SSE2RVV_REPEAT_INDIRECT)()( \ - SSE2RVV_DEC(count), macro, \ - __VA_ARGS__)SSE2RVV_OBSTRUCT(macro)(SSE2RVV_DEC(count), __VA_ARGS__)) -#define SSE2RVV_REPEAT_INDIRECT() SSE2RVV_REPEAT - -#define SSE2RVV_SIZE_OF_byte 8 -#define SSE2RVV_NUMBER_OF_LANES_byte 16 -#define SSE2RVV_SIZE_OF_word 16 -#define SSE2RVV_NUMBER_OF_LANES_word 8 - -#define SSE2RVV_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ - mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ - vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ - vreinterpretq_##type##_m128i(a))); - -#define SSE2RVV_FILL_LANE(i, type) \ - vec_b[i] = \ - vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); - -#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ - number_of_lanes, byte_or_word) \ - do { \ - SSE2RVV_CAT( \ - data_type_prefix, \ - SSE2RVV_CAT(size, SSE2RVV_CAT(x, SSE2RVV_CAT(number_of_lanes, _t)))) \ - vec_b[number_of_lanes]; \ - __m128i mask = SSE2RVV_IIF(byte_or_word)( \ - vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ - vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ - SSE2RVV_EVAL(SSE2RVV_REPEAT(number_of_lanes, SSE2RVV_FILL_LANE, \ - SSE2RVV_CAT(type_prefix, size))) \ - for (int i = 0; i < number_of_lanes; i++) { \ - mtx[i] = \ - SSE2RVV_CAT(vreinterpretq_m128i_u, size)(SSE2RVV_CAT(vbslq_u, size)( \ - SSE2RVV_CAT(vreinterpretq_u, SSE2RVV_CAT(size, _m128i))(mask), \ - SSE2RVV_CAT(vcgeq_, SSE2RVV_CAT(type_prefix, size))( \ - vec_b[i], \ - SSE2RVV_CAT(vreinterpretq_, \ - SSE2RVV_CAT(type_prefix, \ - SSE2RVV_CAT(size, _m128i(a))))), \ - SSE2RVV_CAT(vcleq_, SSE2RVV_CAT(type_prefix, size))( \ - vec_b[i], \ - SSE2RVV_CAT(vreinterpretq_, \ - SSE2RVV_CAT(type_prefix, \ - SSE2RVV_CAT(size, _m128i(a))))))); \ - } \ - } while (0) - -#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ - do { \ - SSE2RVV_EVAL(SSE2RVV_REPEAT(number_of_lanes, \ - SSE2RVV_COMPARE_EQUAL_THEN_FILL_LANE, \ - SSE2RVV_CAT(u, size))) \ - } while (0) - -#define SSE2RVV_CMP_EQUAL_ANY_IMPL(type) \ - static int _sse2rvv_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ - int lb) { \ - __m128i mtx[16]; \ - PCMPSTR_EQ(a, b, mtx, SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type)); \ - return SSE2RVV_CAT( \ - _sse2rvv_aggregate_equal_any_, \ - SSE2RVV_CAT(SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(x, SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ - } - -#define SSE2RVV_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ - static int _sse2rvv_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ - int lb) { \ - __m128i mtx[16]; \ - PCMPSTR_RANGES(a, b, mtx, data_type, us, \ - SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type), byte_or_word); \ - return SSE2RVV_CAT( \ - _sse2rvv_aggregate_ranges_, \ - SSE2RVV_CAT(SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(x, SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ - } - -#define SSE2RVV_CMP_EQUAL_ORDERED_IMPL(type) \ - static int _sse2rvv_cmp_##type##_equal_ordered(__m128i a, int la, __m128i b, \ - int lb) { \ - __m128i mtx[16]; \ - PCMPSTR_EQ(a, b, mtx, SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), 
\ - SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type)); \ - return SSE2RVV_CAT( \ - _sse2rvv_aggregate_equal_ordered_, \ - SSE2RVV_CAT( \ - SSE2RVV_CAT(SSE2RVV_SIZE_OF_, type), \ - SSE2RVV_CAT(x, SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type))))( \ - SSE2RVV_CAT(SSE2RVV_NUMBER_OF_LANES_, type), la, lb, mtx); \ - } - -static int _sse2rvv_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint8x8_t vec_mask = vld1_u8(_sse2rvv_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); - uint8x16_t vec = vcombine_u8(t_lo, t_hi); - for (int j = 0; j < lb; j++) { - mtx[j] = - vreinterpretq_m128i_u8(vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); - mtx[j] = - vreinterpretq_m128i_u8(vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); - int tmp = _sse2rvv_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; - res |= (tmp << j); - } - return res; -} - -static int _sse2rvv_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint16x8_t vec = - vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2rvv_cmpestr_mask16b)); - for (int j = 0; j < lb; j++) { - mtx[j] = vreinterpretq_m128i_u16( - vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); - mtx[j] = vreinterpretq_m128i_u16( - vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); - int tmp = _sse2rvv_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; - res |= (tmp << j); - } - return res; -} - -/* clang-format off */ -#define SSE2RVV_GENERATE_CMP_EQUAL_ANY(prefix) \ - prefix##IMPL(byte) \ - prefix##IMPL(word) -/* clang-format on */ - -SSE2RVV_GENERATE_CMP_EQUAL_ANY(SSE2RVV_CMP_EQUAL_ANY_) - -static int _sse2rvv_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint16x8_t vec = - vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2rvv_cmpestr_mask16b)); - for (int j = 0; j < lb; j++) { - mtx[j] = vreinterpretq_m128i_u16( - vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); - mtx[j] = vreinterpretq_m128i_u16( - vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); - __m128i tmp = vreinterpretq_m128i_u32( - vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); - uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), - vreinterpretq_u32_m128i(tmp)); -#if defined(__aarch64__) || defined(_M_ARM64) - int t = vaddvq_u32(vec_res) ? 1 : 0; -#else - uint64x2_t sumh = vpaddlq_u32(vec_res); - int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); -#endif - res |= (t << j); - } - return res; -} - -static int _sse2rvv_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint8x8_t vec_mask = vld1_u8(_sse2rvv_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); - uint8x16_t vec = vcombine_u8(t_lo, t_hi); - for (int j = 0; j < lb; j++) { - mtx[j] = - vreinterpretq_m128i_u8(vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); - mtx[j] = - vreinterpretq_m128i_u8(vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); - __m128i tmp = vreinterpretq_m128i_u16( - vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); - uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), - vreinterpretq_u16_m128i(tmp)); - int t = _sse2rvv_vaddvq_u16(vec_res) ? 
1 : 0; - res |= (t << j); - } - return res; -} - -#define SSE2RVV_CMP_RANGES_IS_BYTE 1 -#define SSE2RVV_CMP_RANGES_IS_WORD 0 - -/* clang-format off */ -#define SSE2RVV_GENERATE_CMP_RANGES(prefix) \ - prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \ - prefix##IMPL(byte, int, s, prefix##IS_BYTE) \ - prefix##IMPL(word, uint, u, prefix##IS_WORD) \ - prefix##IMPL(word, int, s, prefix##IS_WORD) -/* clang-format on */ - -SSE2RVV_GENERATE_CMP_RANGES(SSE2RVV_CMP_RANGES_) - -#undef SSE2RVV_CMP_RANGES_IS_BYTE -#undef SSE2RVV_CMP_RANGES_IS_WORD - -static int _sse2rvv_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) { - uint8x16_t mtx = - vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); - int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); - int m1 = 0x10000 - (1 << la); - int tb = 0x10000 - (1 << lb); - uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; - uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; - vec_mask = vld1_u8(_sse2rvv_cmpestr_mask8b); - vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); - vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); - vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); - vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); - tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); - tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); - - res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); - res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); - res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); - res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); - res_lo = vand_u8(res_lo, vec_mask); - res_hi = vand_u8(res_hi, vec_mask); - - int res = _sse2rvv_vaddv_u8(res_lo) + (_sse2rvv_vaddv_u8(res_hi) << 8); - return res; -} - -static int _sse2rvv_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) { - uint16x8_t mtx = - vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); - int m0 = (la < lb) ? 
0 : ((1 << la) - (1 << lb)); - int m1 = 0x100 - (1 << la); - int tb = 0x100 - (1 << lb); - uint16x8_t vec_mask = vld1q_u16(_sse2rvv_cmpestr_mask16b); - uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); - uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); - uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); - mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); - mtx = vbslq_u16(vec1, tmp, mtx); - mtx = vandq_u16(mtx, vec_mask); - return _sse2rvv_vaddvq_u16(mtx); -} - -#define SSE2RVV_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 -#define SSE2RVV_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 - -#define SSE2RVV_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ - static int _sse2rvv_aggregate_equal_ordered_##size##x##number_of_lanes( \ - int bound, int la, int lb, __m128i mtx[16]) { \ - int res = 0; \ - int m1 = SSE2RVV_IIF(data_type)(0x10000, 0x100) - (1 << la); \ - uint##size##x8_t vec_mask = \ - SSE2RVV_IIF(data_type)(vld1_u##size(_sse2rvv_cmpestr_mask##size##b), \ - vld1q_u##size(_sse2rvv_cmpestr_mask##size##b)); \ - uint##size##x##number_of_lanes##_t vec1 = SSE2RVV_IIF(data_type)( \ - vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ - vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ - vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ - uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ - uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ - for (int j = 0; j < lb; j++) { \ - mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ - vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ - } \ - for (int j = lb; j < bound; j++) { \ - mtx[j] = vreinterpretq_m128i_u##size( \ - vbslq_u##size(vec1, vec_minusone, vec_zero)); \ - } \ - unsigned SSE2RVV_IIF(data_type)(char, short) *ptr = \ - (unsigned SSE2RVV_IIF(data_type)(char, short) *)mtx; \ - for (int i = 0; i < bound; i++) { \ - int val = 1; \ - for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ - val &= ptr[k * bound + j]; \ - res += val << i; \ - } \ - return res; \ - } - -/* clang-format off */ -#define SSE2RVV_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ - prefix##IMPL(8, 16, prefix##IS_UBYTE) \ - prefix##IMPL(16, 8, prefix##IS_UWORD) -/* clang-format on */ - -SSE2RVV_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2RVV_AGGREGATE_EQUAL_ORDER_) - -#undef SSE2RVV_AGGREGATE_EQUAL_ORDER_IS_UBYTE -#undef SSE2RVV_AGGREGATE_EQUAL_ORDER_IS_UWORD - -/* clang-format off */ -#define SSE2RVV_GENERATE_CMP_EQUAL_ORDERED(prefix) \ - prefix##IMPL(byte) \ - prefix##IMPL(word) -/* clang-format on */ - -SSE2RVV_GENERATE_CMP_EQUAL_ORDERED(SSE2RVV_CMP_EQUAL_ORDERED_) - -#define SSE2RVV_CMPESTR_LIST \ - _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ - _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ - _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ - _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ - _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ - _(CMP_UWORD_RANGES, cmp_uword_ranges) \ - _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ - _(CMP_SWORD_RANGES, cmp_sword_ranges) \ - _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ - _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ - _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ - _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ - _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ - _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ - _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ - _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) - -enum { -#define _(name, func_suffix) name, - SSE2RVV_CMPESTR_LIST -#undef _ -}; -typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); 
-static cmpestr_func_t _sse2rvv_cmpfunc_table[] = { -#define _(name, func_suffix) _sse2rvv_##func_suffix, - SSE2RVV_CMPESTR_LIST -#undef _ -}; - -FORCE_INLINE int _sse2rvv_sido_negative(int res, int lb, int imm8, int bound) { - switch (imm8 & 0x30) { - case _SIDD_NEGATIVE_POLARITY: - res ^= 0xffffffff; - break; - case _SIDD_MASKED_NEGATIVE_POLARITY: - res ^= (1 << lb) - 1; - break; - default: - break; - } - - return res & ((bound == 8) ? 0xFF : 0xFFFF); -} - -FORCE_INLINE int _sse2rvv_clz(unsigned int x) { -#ifdef _MSC_VER - unsigned long cnt = 0; - if (_BitScanReverse(&cnt, x)) - return 31 - cnt; - return 32; -#else - return x != 0 ? __builtin_clz(x) : 32; -#endif -} - -FORCE_INLINE int _sse2rvv_ctz(unsigned int x) { -#ifdef _MSC_VER - unsigned long cnt = 0; - if (_BitScanForward(&cnt, x)) - return cnt; - return 32; -#else - return x != 0 ? __builtin_ctz(x) : 32; -#endif -} - -FORCE_INLINE int _sse2rvv_ctzll(unsigned long long x) { -#ifdef _MSC_VER - unsigned long cnt; -#if defined(SSE2RVV_HAS_BITSCAN64) - if (_BitScanForward64(&cnt, x)) - return (int)(cnt); -#else - if (_BitScanForward(&cnt, (unsigned long)(x))) - return (int)cnt; - if (_BitScanForward(&cnt, (unsigned long)(x >> 32))) - return (int)(cnt + 32); -#endif /* SSE2RVV_HAS_BITSCAN64 */ - return 64; -#else /* assume GNU compatible compilers */ - return x != 0 ? __builtin_ctzll(x) : 64; -#endif -} - -#define SSE2RVV_MIN(x, y) (x) < (y) ? (x) : (y) - -#define SSE2RVV_CMPSTR_SET_UPPER(var, imm) const int var = (imm & 0x01) ? 8 : 16 - -#define SSE2RVV_CMPESTRX_LEN_PAIR(a, b, la, lb) \ - int tmp1 = la ^ (la >> 31); \ - la = tmp1 - (la >> 31); \ - int tmp2 = lb ^ (lb >> 31); \ - lb = tmp2 - (lb >> 31); \ - la = SSE2RVV_MIN(la, bound); \ - lb = SSE2RVV_MIN(lb, bound) - -// Compare all pairs of character in string a and b, -// then aggregate the result. -// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the -// length of string, we use SSE2RVV_CMP{I,E}STRX_GET_LEN to get the length of -// string a and b. -#define SSE2RVV_COMP_AGG(a, b, la, lb, imm8, IE) \ - SSE2RVV_CMPSTR_SET_UPPER(bound, imm8); \ - SSE2RVV_##IE##_LEN_PAIR(a, b, la, lb); \ - int r2 = (_sse2rvv_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ - r2 = _sse2rvv_sido_negative(r2, lb, imm8, bound) - -#define SSE2RVV_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ - return (r2 == 0) \ - ? bound \ - : ((imm8 & 0x40) ? (31 - _sse2rvv_clz(r2)) : _sse2rvv_ctz(r2)) - -#define SSE2RVV_CMPSTR_GENERATE_MASK(dst) \ - __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ - if (imm8 & 0x40) { \ - if (bound == 8) { \ - uint16x8_t tmp = \ - vtstq_u16(vdupq_n_u16(r2), vld1q_u16(_sse2rvv_cmpestr_mask16b)); \ - dst = vreinterpretq_m128i_u16( \ - vbslq_u16(tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ - } else { \ - uint8x16_t vec_r2 = vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ - uint8x16_t tmp = vtstq_u8(vec_r2, vld1q_u8(_sse2rvv_cmpestr_mask8b)); \ - dst = vreinterpretq_m128i_u8( \ - vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ - } \ - } else { \ - if (bound == 16) { \ - dst = vreinterpretq_m128i_u16( \ - vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ - } else { \ - dst = vreinterpretq_m128i_u8( \ - vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ - } \ - } \ - return dst - // Compare packed strings in a and b with lengths la and lb using the control // in imm8, and returns 1 if b did not contain a null character and the // resulting mask was zero, and 0 otherwise. 
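The SSE2RVV_CMPESTRX_LEN_PAIR macro removed above clamps the explicit lengths with a branchless absolute value: the arithmetic shift la >> 31 yields 0 for non-negative values and -1 for negative ones, so the XOR followed by a subtraction either leaves the value unchanged or performs a two's-complement negation. A self-contained demonstration of the trick (assumes a 32-bit int with arithmetic right shift, as the macro itself does):

#include <assert.h>

static int branchless_abs(int x) {
  int s = x >> 31;    /* 0 if x >= 0, -1 if x < 0 */
  return (x ^ s) - s; /* identity when s == 0, negation when s == -1 */
}

int main(void) {
  assert(branchless_abs(-7) == 7);
  assert(branchless_abs(5) == 5);
  return 0;
}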
@@ -3864,11 +3105,8 @@ FORCE_INLINE int _sse2rvv_ctzll(unsigned long long x) { // Compare packed strings in a and b with lengths la and lb using the control // in imm8, and store the generated mask in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm -FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, - const int imm8) { - SSE2RVV_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); - SSE2RVV_CMPSTR_GENERATE_MASK(dst); -} +// FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, +// const int imm8) {} // Compare packed strings in a and b with lengths la and lb using the control in // imm8, and returns bit 0 of the resulting bit mask. @@ -3897,32 +3135,6 @@ FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, // int lb, // const int imm8) {} -#define SSE2RVV_CMPISTRX_LENGTH(str, len, imm8) \ - do { \ - if (imm8 & 0x01) { \ - uint16x8_t equal_mask_##str = \ - vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ - uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ - uint64_t matches_##str = \ - vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ - len = _sse2rvv_ctzll(matches_##str) >> 3; \ - } else { \ - uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ - vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ - uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ - uint64_t matches_##str = \ - vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ - len = _sse2rvv_ctzll(matches_##str) >> 2; \ - } \ - } while (0) - -#define SSE2RVV_CMPISTRX_LEN_PAIR(a, b, la, lb) \ - int la, lb; \ - do { \ - SSE2RVV_CMPISTRX_LENGTH(a, la, imm8); \ - SSE2RVV_CMPISTRX_LENGTH(b, lb, imm8); \ - } while (0) - // Compare packed strings with implicit lengths in a and b using the control in // imm8, and returns 1 if b did not contain a null character and the resulting // mask was zero, and 0 otherwise. 
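For context, the SSE2RVV_CMPISTRX_LENGTH macro deleted above derived the implicit string length by comparing each lane against zero, narrowing the result into a 64-bit mask, and counting trailing zero bits scaled by the per-lane bit width. A plain scalar sketch of the same computation for the byte case (sse2rvv_implicit_len8 is a hypothetical helper, not part of this patch):

#include <stdint.h>

/* Index of the first NUL byte in the 16-byte image of a vector, capped at 16.
 * The 16-bit-character variant would scan uint16_t lanes and cap at 8. */
static int sse2rvv_implicit_len8(const uint8_t bytes[16]) {
  int len = 0;
  while (len < 16 && bytes[len] != 0)
    len++;
  return len;
}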
@@ -3985,106 +3197,6 @@ FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, /* AES */ -#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) -/* clang-format off */ -#define SSE2RVV_AES_SBOX(w) \ - { \ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ - w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ - w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ - w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ - w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ - w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ - w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ - w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ - w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ - w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ - w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ - w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ - w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ - w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ - w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ - w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ - w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ - w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ - w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ - w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ - w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ - w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ - w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ - w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ - w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ - w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ - w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ - w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ - w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ - w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ - w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ - w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ - w(0xb0), w(0x54), w(0xbb), w(0x16) \ - } -#define SSE2RVV_AES_RSBOX(w) \ - { \ - w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ - w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ - w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ - w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ - w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ - w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ - w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ - w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \ - w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \ - w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \ - w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \ - w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \ - 
w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \ - w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \ - w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \ - w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \ - w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \ - w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \ - w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \ - w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \ - w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \ - w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \ - w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \ - w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \ - w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \ - w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \ - w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \ - w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \ - w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \ - w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \ - w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \ - w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \ - w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \ - w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \ - w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \ - w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \ - w(0x55), w(0x21), w(0x0c), w(0x7d) \ - } -/* clang-format on */ - -/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ -#define SSE2RVV_AES_H0(x) (x) -static const uint8_t _sse2rvv_sbox[256] = SSE2RVV_AES_SBOX(SSE2RVV_AES_H0); -static const uint8_t _sse2rvv_rsbox[256] = SSE2RVV_AES_RSBOX(SSE2RVV_AES_H0); -#undef SSE2RVV_AES_H0 - -/* x_time function and matrix multiply function */ -#if !defined(__aarch64__) && !defined(_M_ARM64) -#define SSE2RVV_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) -#define SSE2RVV_MULTIPLY(x, y) \ - (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2RVV_XT(x)) ^ \ - ((y >> 2 & 1) * SSE2RVV_XT(SSE2RVV_XT(x))) ^ \ - ((y >> 3 & 1) * SSE2RVV_XT(SSE2RVV_XT(SSE2RVV_XT(x)))) ^ \ - ((y >> 4 & 1) * SSE2RVV_XT(SSE2RVV_XT(SSE2RVV_XT(SSE2RVV_XT(x)))))) -#endif - // In the absence of crypto extensions, implement aesenc using regular NEON // intrinsics instead. See: // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ @@ -4121,48 +3233,6 @@ static const uint8_t _sse2rvv_rsbox[256] = SSE2RVV_AES_RSBOX(SSE2RVV_AES_H0); // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ // for details. // FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) {} -#undef SSE2RVV_AES_SBOX -#undef SSE2RVV_AES_RSBOX - -#if defined(__aarch64__) -#undef SSE2RVV_XT -#undef SSE2RVV_MULTIPLY -#endif - -#else /* __ARM_FEATURE_CRYPTO */ -// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and -// AESMC and then manually applying the real key as an xor operation. This -// unfortunately means an additional xor op; the compiler should be able to -// optimize this away for repeated calls however. See -// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a -// for more details. 
-// FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) {} - -// Perform one round of an AES decryption flow on data (state) in a using the -// round key in RoundKey, and store the result in dst. -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 -// FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) {} - -// Perform the last round of an AES encryption flow on data (state) in a using -// the round key in RoundKey, and store the result in dst. -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 -// FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) {} - -// Perform the last round of an AES decryption flow on data (state) in a using -// the round key in RoundKey, and store the result in dst. -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 -// FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) {} - -// Perform the InvMixColumns transformation on a and store the result in dst. -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 -// FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) {} - -// Assist in expanding the AES cipher key by computing steps towards generating -// a round key for encryption cipher using data from a and an 8-bit round -// constant specified in imm8, and store the result in dst." -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 -// FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) {} -#endif /* Others */ @@ -4188,43 +3258,7 @@ static const uint8_t _sse2rvv_rsbox[256] = SSE2RVV_AES_RSBOX(SSE2RVV_AES_H0); // Return the current 64-bit value of the processor's time-stamp counter. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc -FORCE_INLINE uint64_t _rdtsc(void) { -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t val; - - /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the - * system counter is at least 56 bits wide; from Armv8.6, the counter - * must be 64 bits wide. So the system counter could be less than 64 - * bits wide and it is attributed with the flag 'cap_user_time_short' - * is true. - */ -#if defined(_MSC_VER) - val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); -#else - __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); -#endif - - return val; -#else - uint32_t pmccntr, pmuseren, pmcntenset; - // Read the user mode Performance Monitoring Unit (PMU) - // User Enable Register (PMUSERENR) access permissions. - __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); - if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code. - __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); - if (pmcntenset & 0x80000000UL) { // Is it counting? - __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); - // The counter is set up to count every 64th cycle - return (uint64_t)(pmccntr) << 6; - } - } - - // Fallback to syscall as we can't enable PMUSERENR in user mode. 
-  struct timeval tv;
-  gettimeofday(&tv, NULL);
-  return (uint64_t)(tv.tv_sec) * 1000000 + tv.tv_usec;
-#endif
-}
+// FORCE_INLINE uint64_t _rdtsc(void) {}
 
 #if defined(__GNUC__) || defined(__clang__)
 #pragma pop_macro("ALIGN_STRUCT")
diff --git a/tests/impl.cpp b/tests/impl.cpp
index ef8eeae..cb03180 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -11229,6 +11229,12 @@ result_t SSE2RVV_TEST_IMPL::run_single_test(INSTRUCTION_TEST test, uint32_t i) {
   return ret;
 }
 
+const char *instruction_string[] = {
+#define _(x) #x,
+    INTRIN_LIST
+#undef _
+};
+
 SSE2RVV_TEST *SSE2RVV_TEST::create(void) {
   SSE2RVV_TEST_IMPL *st = new SSE2RVV_TEST_IMPL;
   return static_cast<SSE2RVV_TEST *>(st);
diff --git a/tests/impl.h b/tests/impl.h
index 60f02b9..d05d5f8 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -545,7 +545,7 @@ namespace SSE2RVV {
 // A short C implementation of every intrinsic is implemented and compared to
 // the actual expected results from the corresponding SSE intrinsic against all
 // of the 10,000 randomized input vectors. When running on RISCV, then the
-// results are compared to the NEON approximate version.
+// results are compared to the RISCV approximate version.
 extern const char *instruction_string[];
 enum INSTRUCTION_TEST {
 #define _(x) it_##x,
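The instruction_string table added in tests/impl.cpp is generated from the same INTRIN_LIST X macro that tests/impl.h expands into the it_* enumerators, so the enum values and the printable names stay aligned by construction. A self-contained illustration of the pattern (TOY_LIST is a stand-in, not the project's real INTRIN_LIST):

#include <stdio.h>

#define TOY_LIST \
  _(mm_add_ps)   \
  _(mm_sub_ps)

enum {
#define _(x) it_##x,
  TOY_LIST
#undef _
};

static const char *toy_string[] = {
#define _(x) #x,
  TOY_LIST
#undef _
};

int main(void) {
  /* prints "1 -> mm_sub_ps": the enum index selects the matching name */
  printf("%d -> %s\n", it_mm_sub_ps, toy_string[it_mm_sub_ps]);
  return 0;
}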