-
Notifications
You must be signed in to change notification settings - Fork 61
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Made has_equal_in a new format callable #2064
base: main
Are you sure you want to change the base?
Conversation
acd53c7
to
7a34c68
Compare
752ec1c
to
08f7713
Compare
Waiting for the SVE2 CI |
4743b5d
to
ef14221
Compare
Closes #2065 |
Codegen for ; exactly 128 bits of data
test(eve::arm_sve256_v0::wide<unsigned char, eve::fixed<16l>>, eve::arm_sve256_v0::wide<unsigned char, eve::fixed<16l>>):
ptrue p0.b, vl32
match p0.b, p0/z, z0.b, z1.b
ret
; less than 128 bits of data (32 bits)
test(eve::arm_sve256_v0::wide<unsigned char, eve::fixed<4l>>, eve::arm_sve256_v0::wide<unsigned char, eve::fixed<4l>>):
mov z1.s, s1
ptrue p0.b, vl4
match p0.b, p0/z, z0.b, z1.b
ret
; more than 128 bits of data (256, in SVE2-256 mode)
test(eve::arm_sve256_v0::wide<unsigned char, eve::fixed<32l>>, eve::arm_sve256_v0::wide<unsigned char, eve::fixed<32l>>):
ptrue p0.b, vl32
movprfx z26, z1
ext z26.b, z26.b, z1.b, #16
match p3.b, p0/z, z0.b, z26.b
match p2.b, p0/z, z0.b, z1.b
movprfx z27, z0
ext z27.b, z27.b, z0.b, #16
orr p2.b, p0/z, p2.b, p3.b
match p1.b, p0/z, z27.b, z26.b
match p3.b, p0/z, z27.b, z1.b
orr p3.b, p0/z, p3.b, p1.b
index z29.b, #0, #1
mov z30.b, p3/z, #-1
cmpls p1.b, p0/z, z29.b, #15
mov z31.b, p2/z, #-1
mov z28.b, p1/z, #-1
and z28.b, z28.b, #0x10
cmplo p3.b, p0/z, z29.b, z28.b
splice z31.b, p3, z31.b, z30.b
cmpne p0.b, p0/z, z31.b, #0
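ret
The code for the first two cases is optimal (the special "broadcasting move" is used to round up the second operand to 128 bits while keeping the same values, and the mask used by the `match` instruction is adjusted to only consider the active lanes). The code for the third case is good enough: we have two pairs of `match` instructions whose results are combined with `orr`, followed by a `splice` to assemble the final mask.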
8bb3943
to
bcb20a9
Compare
bcb20a9
to
936db96
Compare
I don't like the 256-bit codegen; I think it's a bit much. You can match 16 bytes against 16 bytes, then swap the two sides and do that again.
include/eve/module/core/regular/impl/simd/arm/sve/has_equal_in.hpp
Outdated
Show resolved
Hide resolved
Updated codegen with the new algorithm:
test(eve::arm_sve512_v0::wide<unsigned char, eve::fixed<32l> >, eve::arm_sve512_v0::wide<unsigned char, eve::fixed<32l> >):
ptrue p3.b, vl64
movprfx z31, z1
ext z31.b, z31.b, z1.b, #32
ext z31.b, z31.b, z1.b, #32
movprfx z30, z31
ext z30.b, z30.b, z31.b, #16
match p0.b, p3/z, z0.b, z31.b
match p2.b, p3/z, z0.b, z30.b
orr p0.b, p3/z, p0.b, p2.b
ret
SVE-512 - 512 bits (4 match lanes)
test(eve::arm_sve512_v0::wide<unsigned char, eve::fixed<64l> >, eve::arm_sve512_v0::wide<unsigned char, eve::fixed<64l> >):
ptrue p3.b, vl64
movprfx z31, z1
ext z31.b, z31.b, z1.b, #0
ext z31.b, z31.b, z1.b, #0
movprfx z30, z31
ext z30.b, z30.b, z31.b, #16
match p1.b, p3/z, z0.b, z30.b
movprfx z29, z30
ext z29.b, z29.b, z30.b, #16
match p2.b, p3/z, z0.b, z31.b
movprfx z28, z29
ext z28.b, z28.b, z29.b, #16
orr p2.b, p3/z, p2.b, p1.b
match p0.b, p3/z, z0.b, z28.b
match p1.b, p3/z, z0.b, z29.b
orr p2.b, p3/z, p2.b, p1.b
orr p0.b, p3/z, p2.b, p0.b
ret
SVE-256 - 256 bits (2 match lanes)
test(eve::arm_sve256_v0::wide<unsigned char, eve::fixed<32l> >, eve::arm_sve256_v0::wide<unsigned char, eve::fixed<32l> >):
ptrue p3.b, vl32
movprfx z31, z1
ext z31.b, z31.b, z1.b, #0
ext z31.b, z31.b, z1.b, #0
movprfx z30, z31
ext z30.b, z30.b, z31.b, #16
match p0.b, p3/z, z0.b, z31.b
match p2.b, p3/z, z0.b, z30.b
orr p0.b, p3/z, p0.b, p2.b
ret
Tested using:
#include <eve/eve.hpp>
using namespace eve;
using T = wide<unsigned char, fixed<64>>; // change size & type here
auto test(T a, T b) {
return has_equal_in(a, b);
} |
de44353
to
1d24753
Compare
1d24753
to
ef7c329
Compare
2e4b71f
to
107ae89
Compare
struct has_equal_in_t : callable<has_equal_in_t, Options> | ||
{ | ||
template<simd_value T, simd_value U, simd_predicate<T, U> Op> | ||
constexpr EVE_FORCEINLINE auto operator()(T x, U match_against, Op op) const noexcept -> decltype(op(x, match_against)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
std::result_of_t
// There is no need to broadcast the values inside the first operand because we will just adjust the mask to only | ||
// consider the active lanes. | ||
fw_t haystack{x}; | ||
fw_t needle = shuffle(fw_t{match_against}, eve::as_pattern([](auto i, auto) { return i % N::value; })); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shuffle_l<3>(fw_t{match_against}, [](auto i, auto) { return i % N::value; });
No description provided.