Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add veor3q_[s8|s16|s32|s64|u8|u16|u32|u64] #501

Merged
merged 1 commit into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ endif

ifndef CROSS_COMPILE
processor := $(shell uname -m)
ARCH_CFLAGS = -march=armv8.4-a+simd+i8mm+dotprod
ARCH_CFLAGS = -march=armv8.4-a+simd+i8mm+dotprod+sha3
else # CROSS_COMPILE was set
CC = $(CROSS_COMPILE)gcc
CXX = $(CROSS_COMPILE)g++
Expand Down
32 changes: 24 additions & 8 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -10701,21 +10701,37 @@ FORCE_INLINE int32x4_t vdotq_lane_s32(int32x4_t r, int8x16_t a, int8x8_t b, cons

// FORCE_INLINE uint64x2_t vsha512su1q_u64(uint64x2_t s01_s02, uint64x2_t w14_15, uint64x2_t w9_10);

// FORCE_INLINE uint8x16_t veor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);
FORCE_INLINE uint8x16_t veor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  // Three-way XOR (a ^ b ^ c): fold b and c together first, then XOR that
  // into a. Both intrinsic calls are issued with an element count of 16.
  uint8x16_t b_xor_c = __riscv_vxor_vv_u8m1(b, c, 16);
  return __riscv_vxor_vv_u8m1(a, b_xor_c, 16);
}

// FORCE_INLINE uint16x8_t veor3q_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);
FORCE_INLINE uint16x8_t veor3q_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  // Three-way XOR (a ^ b ^ c): fold b and c together first, then XOR that
  // into a. Both intrinsic calls are issued with an element count of 8.
  uint16x8_t b_xor_c = __riscv_vxor_vv_u16m1(b, c, 8);
  return __riscv_vxor_vv_u16m1(a, b_xor_c, 8);
}

// FORCE_INLINE uint32x4_t veor3q_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);
FORCE_INLINE uint32x4_t veor3q_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  // Three-way XOR (a ^ b ^ c): fold b and c together first, then XOR that
  // into a. Both intrinsic calls are issued with an element count of 4.
  uint32x4_t b_xor_c = __riscv_vxor_vv_u32m1(b, c, 4);
  return __riscv_vxor_vv_u32m1(a, b_xor_c, 4);
}

// FORCE_INLINE uint64x2_t veor3q_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c);
FORCE_INLINE uint64x2_t veor3q_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
  // Three-way XOR (a ^ b ^ c): fold b and c together first, then XOR that
  // into a. Both intrinsic calls are issued with an element count of 2.
  uint64x2_t b_xor_c = __riscv_vxor_vv_u64m1(b, c, 2);
  return __riscv_vxor_vv_u64m1(a, b_xor_c, 2);
}

// FORCE_INLINE int8x16_t veor3q_s8(int8x16_t a, int8x16_t b, int8x16_t c);
FORCE_INLINE int8x16_t veor3q_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  // Three-way XOR (a ^ b ^ c): fold b and c together first, then XOR that
  // into a. Both intrinsic calls are issued with an element count of 16.
  int8x16_t b_xor_c = __riscv_vxor_vv_i8m1(b, c, 16);
  return __riscv_vxor_vv_i8m1(a, b_xor_c, 16);
}

// FORCE_INLINE int16x8_t veor3q_s16(int16x8_t a, int16x8_t b, int16x8_t c);
FORCE_INLINE int16x8_t veor3q_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  // Three-way XOR (a ^ b ^ c): fold b and c together first, then XOR that
  // into a. Both intrinsic calls are issued with an element count of 8.
  int16x8_t b_xor_c = __riscv_vxor_vv_i16m1(b, c, 8);
  return __riscv_vxor_vv_i16m1(a, b_xor_c, 8);
}

// FORCE_INLINE int32x4_t veor3q_s32(int32x4_t a, int32x4_t b, int32x4_t c);
FORCE_INLINE int32x4_t veor3q_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  // Three-way XOR (a ^ b ^ c): fold b and c together first, then XOR that
  // into a. Both intrinsic calls are issued with an element count of 4.
  int32x4_t b_xor_c = __riscv_vxor_vv_i32m1(b, c, 4);
  return __riscv_vxor_vv_i32m1(a, b_xor_c, 4);
}

// FORCE_INLINE int64x2_t veor3q_s64(int64x2_t a, int64x2_t b, int64x2_t c);
FORCE_INLINE int64x2_t veor3q_s64(int64x2_t a, int64x2_t b, int64x2_t c) {
  // Three-way XOR (a ^ b ^ c): fold b and c together first, then XOR that
  // into a. Both intrinsic calls are issued with an element count of 2.
  int64x2_t b_xor_c = __riscv_vxor_vv_i64m1(b, c, 2);
  return __riscv_vxor_vv_i64m1(a, b_xor_c, 2);
}

// FORCE_INLINE uint64x2_t vrax1q_u64(uint64x2_t a, uint64x2_t b);

Expand Down
194 changes: 186 additions & 8 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37311,21 +37311,199 @@ result_t test_vsha512su0q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { r

result_t test_vsha512su1q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

result_t test_veor3q_u8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
// Checks veor3q_u8 against a scalar three-way XOR of the first 16 bytes of the
// three integer test buffers. Reports TEST_UNIMPL when compiled with clang or
// when ENABLE_TEST_ALL is not defined.
result_t test_veor3q_u8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__) || !defined(ENABLE_TEST_ALL)
  return TEST_UNIMPL;
#else
  const uint8_t *src_a = (const uint8_t *)impl.test_cases_int_pointer1;
  const uint8_t *src_b = (const uint8_t *)impl.test_cases_int_pointer2;
  const uint8_t *src_c = (const uint8_t *)impl.test_cases_int_pointer3;

  // Scalar reference result, one entry per lane.
  uint8_t expected[16];
  for (int lane = 0; lane < 16; ++lane) {
    expected[lane] = src_a[lane] ^ src_b[lane] ^ src_c[lane];
  }

  uint8x16_t vec_a = vld1q_u8(src_a);
  uint8x16_t vec_b = vld1q_u8(src_b);
  uint8x16_t vec_c = vld1q_u8(src_c);
  uint8x16_t actual = veor3q_u8(vec_a, vec_b, vec_c);
  return validate_uint8(actual, expected[0], expected[1], expected[2], expected[3], expected[4], expected[5],
                        expected[6], expected[7], expected[8], expected[9], expected[10], expected[11],
                        expected[12], expected[13], expected[14], expected[15]);
#endif  // defined(__clang__) || !defined(ENABLE_TEST_ALL)
}

// Checks veor3q_u16 against a scalar three-way XOR of the first 8 uint16_t
// elements of the three integer test buffers. Reports TEST_UNIMPL when
// compiled with clang or when ENABLE_TEST_ALL is not defined.
result_t test_veor3q_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__) || !defined(ENABLE_TEST_ALL)
  return TEST_UNIMPL;
#else
  const uint16_t *src_a = (const uint16_t *)impl.test_cases_int_pointer1;
  const uint16_t *src_b = (const uint16_t *)impl.test_cases_int_pointer2;
  const uint16_t *src_c = (const uint16_t *)impl.test_cases_int_pointer3;

  // Scalar reference result, one entry per lane.
  uint16_t expected[8];
  for (int lane = 0; lane < 8; ++lane) {
    expected[lane] = src_a[lane] ^ src_b[lane] ^ src_c[lane];
  }

  uint16x8_t vec_a = vld1q_u16(src_a);
  uint16x8_t vec_b = vld1q_u16(src_b);
  uint16x8_t vec_c = vld1q_u16(src_c);
  uint16x8_t actual = veor3q_u16(vec_a, vec_b, vec_c);
  return validate_uint16(actual, expected[0], expected[1], expected[2], expected[3], expected[4], expected[5],
                         expected[6], expected[7]);
#endif  // defined(__clang__) || !defined(ENABLE_TEST_ALL)
}

// Checks veor3q_u32 against a scalar three-way XOR of the first 4 uint32_t
// elements of the three integer test buffers. Reports TEST_UNIMPL when
// compiled with clang or when ENABLE_TEST_ALL is not defined.
result_t test_veor3q_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__) || !defined(ENABLE_TEST_ALL)
  return TEST_UNIMPL;
#else
  const uint32_t *src_a = (const uint32_t *)impl.test_cases_int_pointer1;
  const uint32_t *src_b = (const uint32_t *)impl.test_cases_int_pointer2;
  const uint32_t *src_c = (const uint32_t *)impl.test_cases_int_pointer3;

  // Scalar reference result, one entry per lane.
  uint32_t expected[4];
  for (int lane = 0; lane < 4; ++lane) {
    expected[lane] = src_a[lane] ^ src_b[lane] ^ src_c[lane];
  }

  uint32x4_t vec_a = vld1q_u32(src_a);
  uint32x4_t vec_b = vld1q_u32(src_b);
  uint32x4_t vec_c = vld1q_u32(src_c);
  uint32x4_t actual = veor3q_u32(vec_a, vec_b, vec_c);
  return validate_uint32(actual, expected[0], expected[1], expected[2], expected[3]);
#endif  // defined(__clang__) || !defined(ENABLE_TEST_ALL)
}

// Checks veor3q_u64 against a scalar three-way XOR of the first 2 uint64_t
// elements of the three integer test buffers. Reports TEST_UNIMPL when
// compiled with clang or when ENABLE_TEST_ALL is not defined.
result_t test_veor3q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__) || !defined(ENABLE_TEST_ALL)
  return TEST_UNIMPL;
#else
  const uint64_t *src_a = (const uint64_t *)impl.test_cases_int_pointer1;
  const uint64_t *src_b = (const uint64_t *)impl.test_cases_int_pointer2;
  const uint64_t *src_c = (const uint64_t *)impl.test_cases_int_pointer3;

  // Scalar reference result, one entry per lane.
  uint64_t expected[2];
  for (int lane = 0; lane < 2; ++lane) {
    expected[lane] = src_a[lane] ^ src_b[lane] ^ src_c[lane];
  }

  uint64x2_t vec_a = vld1q_u64(src_a);
  uint64x2_t vec_b = vld1q_u64(src_b);
  uint64x2_t vec_c = vld1q_u64(src_c);
  uint64x2_t actual = veor3q_u64(vec_a, vec_b, vec_c);
  return validate_uint64(actual, expected[0], expected[1]);
#endif  // defined(__clang__) || !defined(ENABLE_TEST_ALL)
}

// Checks veor3q_s8 against a scalar reference: XORs the first 16 int8_t
// elements of the three integer test buffers lane by lane and compares the
// vector result through validate_int8. Returns TEST_UNIMPL when compiled with
// clang or when ENABLE_TEST_ALL is not defined.
result_t test_veor3q_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
const int8_t *_a = (const int8_t *)impl.test_cases_int_pointer1;
const int8_t *_b = (const int8_t *)impl.test_cases_int_pointer2;
const int8_t *_c = (const int8_t *)impl.test_cases_int_pointer3;
// Scalar reference: per-lane a ^ b ^ c.
int8_t _d[16];
for (int i = 0; i < 16; i++) {
_d[i] = _a[i] ^ _b[i] ^ _c[i];
}

// NOTE(review): the next line looks like a removed stub interleaved by the diff
// rendering rather than part of this function — confirm against the real file.
result_t test_veor3q_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
int8x16_t a = vld1q_s8(_a);
int8x16_t b = vld1q_s8(_b);
int8x16_t c = vld1q_s8(_c);
int8x16_t d = veor3q_s8(a, b, c);
return validate_int8(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7], _d[8], _d[9], _d[10], _d[11], _d[12],
_d[13], _d[14], _d[15]);
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
#endif // defined(__clang__)
}

result_t test_veor3q_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
// Checks veor3q_s16 against a scalar reference: XORs the first 8 int16_t
// elements of the three integer test buffers lane by lane and compares the
// vector result through validate_int16. Returns TEST_UNIMPL when compiled with
// clang or when ENABLE_TEST_ALL is not defined.
result_t test_veor3q_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
const int16_t *_c = (const int16_t *)impl.test_cases_int_pointer3;
// Scalar reference: per-lane a ^ b ^ c.
int16_t _d[8];
for (int i = 0; i < 8; i++) {
_d[i] = _a[i] ^ _b[i] ^ _c[i];
}

// NOTE(review): the next line looks like a removed stub interleaved by the diff
// rendering rather than part of this function — confirm against the real file.
result_t test_veor3q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
int16x8_t a = vld1q_s16(_a);
int16x8_t b = vld1q_s16(_b);
int16x8_t c = vld1q_s16(_c);
int16x8_t d = veor3q_s16(a, b, c);
return validate_int16(d, _d[0], _d[1], _d[2], _d[3], _d[4], _d[5], _d[6], _d[7]);
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
#endif // defined(__clang__)
}

result_t test_veor3q_s8(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
// Checks veor3q_s32 against a scalar reference: XORs the first 4 int32_t
// elements of the three integer test buffers lane by lane and compares the
// vector result through validate_int32. Returns TEST_UNIMPL when compiled with
// clang or when ENABLE_TEST_ALL is not defined.
result_t test_veor3q_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
const int32_t *_c = (const int32_t *)impl.test_cases_int_pointer3;
// Scalar reference: per-lane a ^ b ^ c.
int32_t _d[4];
for (int i = 0; i < 4; i++) {
_d[i] = _a[i] ^ _b[i] ^ _c[i];
}

// NOTE(review): the next line looks like a removed stub interleaved by the diff
// rendering rather than part of this function — confirm against the real file.
result_t test_veor3q_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
int32x4_t a = vld1q_s32(_a);
int32x4_t b = vld1q_s32(_b);
int32x4_t c = vld1q_s32(_c);
int32x4_t d = veor3q_s32(a, b, c);
return validate_int32(d, _d[0], _d[1], _d[2], _d[3]);
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
#endif // defined(__clang__)
}

result_t test_veor3q_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
// Checks veor3q_s64 against a scalar reference: XORs the first 2 int64_t
// elements of the three integer test buffers lane by lane and compares the
// vector result through validate_int64. Returns TEST_UNIMPL when compiled with
// clang or when ENABLE_TEST_ALL is not defined.
result_t test_veor3q_s64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
return TEST_UNIMPL;
#else
#ifdef ENABLE_TEST_ALL
const int64_t *_a = (const int64_t *)impl.test_cases_int_pointer1;
const int64_t *_b = (const int64_t *)impl.test_cases_int_pointer2;
const int64_t *_c = (const int64_t *)impl.test_cases_int_pointer3;
// Scalar reference: per-lane a ^ b ^ c.
int64_t _d[2];
for (int i = 0; i < 2; i++) {
_d[i] = _a[i] ^ _b[i] ^ _c[i];
}

// NOTE(review): the next line looks like a removed stub interleaved by the diff
// rendering rather than part of this function — confirm against the real file.
result_t test_veor3q_s64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
int64x2_t a = vld1q_s64(_a);
int64x2_t b = vld1q_s64(_b);
int64x2_t c = vld1q_s64(_c);
int64x2_t d = veor3q_s64(a, b, c);
return validate_int64(d, _d[0], _d[1]);
#else
return TEST_UNIMPL;
#endif // ENABLE_TEST_ALL
#endif // defined(__clang__)
}

result_t test_vrax1q_u64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

Expand Down
16 changes: 8 additions & 8 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -4069,14 +4069,14 @@
/*_(vsha512h2q_u64) */ \
/*_(vsha512su0q_u64) */ \
/*_(vsha512su1q_u64) */ \
/*_(veor3q_u8) */ \
/*_(veor3q_u16) */ \
/*_(veor3q_u32) */ \
/*_(veor3q_u64) */ \
/*_(veor3q_s8) */ \
/*_(veor3q_s16) */ \
/*_(veor3q_s32) */ \
/*_(veor3q_s64) */ \
_(veor3q_u8) \
_(veor3q_u16) \
_(veor3q_u32) \
_(veor3q_u64) \
_(veor3q_s8) \
_(veor3q_s16) \
_(veor3q_s32) \
_(veor3q_s64) \
/*_(vrax1q_u64) */ \
/*_(vxarq_u64) */ \
/*_(vbcaxq_u8) */ \
Expand Down
Loading