Skip to content

Commit 50d1c66

Browse files
Optimize loading of batch_bool from memory on arm
Use the same approach as #1172
1 parent 005f629 commit 50d1c66

File tree

1 file changed

+46
-0
lines changed

1 file changed

+46
-0
lines changed

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,52 @@ namespace xsimd
573573
return vld1q_f32(src);
574574
}
575575

576+
/* batch bool version */
577+
template <class T, class A, detail::enable_sized_t<T, 1> = 0>
578+
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<neon>) noexcept
579+
{
580+
auto vmem = load_unaligned<A>((unsigned char const*)mem, convert<unsigned char> {}, A {});
581+
return batch_bool<T, A>(bitwise_cast<typename std::make_unsigned<T>::type>(0 - vmem).data);
582+
}
583+
template <class T, class A, detail::enable_sized_t<T, 1> = 0>
584+
XSIMD_INLINE batch_bool<T, A> load_aligned(bool const* mem, batch_bool<T, A> t, requires_arch<neon> r) noexcept
585+
{
586+
return load_unaligned(mem, t, r);
587+
}
588+
589+
template <class T, class A, detail::enable_sized_t<T, 2> = 0>
590+
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<neon>) noexcept
591+
{
592+
batch<unsigned short, neon> vmem = vmovl_u8(vld1_u8((unsigned char const*)mem));
593+
return batch_bool<T, A>(bitwise_cast<typename std::make_unsigned<T>::type>(0 - vmem).data);
594+
}
595+
596+
template <class T, class A, detail::enable_sized_t<T, 2> = 0>
597+
XSIMD_INLINE batch_bool<T, A> load_aligned(bool const* mem, batch_bool<T, A> t, requires_arch<neon> r) noexcept
598+
{
599+
return load_unaligned(mem, t, r);
600+
}
601+
602+
template <class T, class A, detail::enable_sized_t<T, 4> = 0>
603+
XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<neon>) noexcept
604+
{
605+
uint32x2_t tmp = vset_lane_u32(*(unsigned int*)mem, vdup_n_u32(0), 0);
606+
return batch_bool<T, A>(bitwise_cast<typename std::make_unsigned<T>::type>(0 - vmovl_u16(vget_low_u16(vmovl_u8(tmp)))));
607+
}
608+
609+
template <class T, class A, detail::enable_sized_t<T, 4> = 0>
610+
XSIMD_INLINE batch_bool<T, A> load_aligned(bool const* mem, batch_bool<T, A> t, requires_arch<neon> r) noexcept
611+
{
612+
return load_unaligned(mem, t, r);
613+
}
614+
615+
template <class A> = 0>
616+
XSIMD_INLINE batch_bool<float, A> load_aligned(bool const* mem, batch_bool<float, A> t, requires_arch<neon> r) noexcept
617+
{
618+
uint32x2_t tmp = vset_lane_u32(*(unsigned int*)mem, vdup_n_u32(0), 0);
619+
return batch_bool<float, A>(0 - vmovl_u16(vget_low_u16(vmovl_u8(tmp))));
620+
}
621+
576622
/*********
577623
* store *
578624
*********/

0 commit comments

Comments
 (0)