Skip to content

Commit c3f023a

Browse files
authored
Improve move_mask and boolean ops for i64x2 i32x4 i16x8 i8x16 (#162)
* bug in scalar behavior of f32::fast_min/fast_max * fixing max * neon * simd * aarch * aarch * neon * update wasmtime version * fix wasm * fix * fix movemask * syntax * fixed fast_max * revert yaml * movemask and shr * movemask only * fix arm * fix neon * fix neon * add neon to i64x2 * centralize move_mask ops in integer code * update wasmtime version * update wasmtime version * add tracing * more tracing for all * added i64 tests for movemask * improve tests * fixed * revert build changes * remove float changes for now * better neon version * remove inefficient neon implementation
1 parent ab41447 commit c3f023a

15 files changed

+468
-39
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,6 @@ std = []
2323
[dependencies]
2424
safe_arch = { version = "0.7", features = ["bytemuck"] }
2525
bytemuck = "1"
26+
27+
[dev-dependencies]
28+
rand = "0.8"

src/i16x8_.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,10 @@ impl i16x8 {
487487
(move_mask_i8_m128i(self.sse) & 0b1010101010101010) != 0
488488
} else if #[cfg(target_feature="simd128")] {
489489
u16x8_bitmask(self.simd) != 0
490+
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
491+
unsafe {
492+
vminvq_s16(self.neon) < 0
493+
}
490494
} else {
491495
let v : [u64;2] = cast(self);
492496
((v[0] | v[1]) & 0x8000800080008000) != 0
@@ -502,6 +506,10 @@ impl i16x8 {
502506
(move_mask_i8_m128i(self.sse) & 0b1010101010101010) == 0b1010101010101010
503507
} else if #[cfg(target_feature="simd128")] {
504508
u16x8_bitmask(self.simd) == 0b11111111
509+
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
510+
unsafe {
511+
vmaxvq_s16(self.neon) < 0
512+
}
505513
} else {
506514
let v : [u64;2] = cast(self);
507515
(v[0] & v[1] & 0x8000800080008000) == 0x8000800080008000

src/i32x4_.rs

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,9 @@ impl_shr_t_for_i32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
331331
/// the type. (same as wrapping_shr)
332332
impl Shr<i32x4> for i32x4 {
333333
type Output = Self;
334+
335+
#[inline]
336+
#[must_use]
334337
fn shr(self, rhs: i32x4) -> Self::Output {
335338
pick! {
336339
if #[cfg(target_feature="avx2")] {
@@ -365,6 +368,9 @@ impl Shr<i32x4> for i32x4 {
365368
/// the type. (same as wrapping_shl)
366369
impl Shl<i32x4> for i32x4 {
367370
type Output = Self;
371+
372+
#[inline]
373+
#[must_use]
368374
fn shl(self, rhs: i32x4) -> Self::Output {
369375
pick! {
370376
if #[cfg(target_feature="avx2")] {
@@ -618,6 +624,7 @@ impl i32x4 {
618624
pub fn move_mask(self) -> i32 {
619625
pick! {
620626
if #[cfg(target_feature="sse")] {
627+
// use f32 move_mask since it is the same size as i32
621628
move_mask_m128(cast(self.sse))
622629
} else if #[cfg(target_feature="simd128")] {
623630
u32x4_bitmask(self.simd) as i32
@@ -631,14 +638,14 @@ impl i32x4 {
631638
let selectbit : uint32x4_t = core::intrinsics::transmute([1u32, 2, 4, 8]);
632639
let r = vandq_u32(masked, selectbit);
633640

634-
// horizontally add the 16-bit lanes
641+
// horizontally add the 32-bit lanes
635642
vaddvq_u32(r) as i32
636643
}
637644
} else {
638-
(((self.arr[0] as i32) < 0) as i32) << 0 |
639-
(((self.arr[1] as i32) < 0) as i32) << 1 |
640-
(((self.arr[2] as i32) < 0) as i32) << 2 |
641-
(((self.arr[3] as i32) < 0) as i32) << 3
645+
((self.arr[0] < 0) as i32) << 0 |
646+
((self.arr[1] < 0) as i32) << 1 |
647+
((self.arr[2] < 0) as i32) << 2 |
648+
((self.arr[3] < 0) as i32) << 3
642649
}
643650
}
644651
}
@@ -647,10 +654,16 @@ impl i32x4 {
647654
#[must_use]
648655
pub fn any(self) -> bool {
649656
pick! {
650-
if #[cfg(target_feature="sse2")] {
651-
(move_mask_i8_m128i(self.sse) & 0b1000100010001000) != 0
657+
if #[cfg(target_feature="sse")] {
658+
// use f32 move_mask since it is the same size as i32
659+
move_mask_m128(cast(self.sse)) != 0
652660
} else if #[cfg(target_feature="simd128")] {
653661
u32x4_bitmask(self.simd) != 0
662+
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
663+
// some lanes are negative
664+
unsafe {
665+
vminvq_s32(self.neon) < 0
666+
}
654667
} else {
655668
let v : [u64;2] = cast(self);
656669
((v[0] | v[1]) & 0x8000000080000000) != 0
@@ -662,10 +675,16 @@ impl i32x4 {
662675
#[must_use]
663676
pub fn all(self) -> bool {
664677
pick! {
665-
if #[cfg(target_feature="sse2")] {
666-
(move_mask_i8_m128i(self.sse) & 0b1000100010001000) == 0b1000100010001000
678+
if #[cfg(target_feature="sse")] {
679+
// use f32 move_mask since it is the same size as i32
680+
move_mask_m128(cast(self.sse)) == 0b1111
667681
} else if #[cfg(target_feature="simd128")] {
668682
u32x4_bitmask(self.simd) == 0b1111
683+
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
684+
// all lanes are negative
685+
unsafe {
686+
vmaxvq_s32(self.neon) < 0
687+
}
669688
} else {
670689
let v : [u64;2] = cast(self);
671690
(v[0] & v[1] & 0x8000000080000000) == 0x8000000080000000

src/i32x8_.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,9 @@ impl_shr_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
236236
/// the type. (same as wrapping_shr)
237237
impl Shr<i32x8> for i32x8 {
238238
type Output = Self;
239+
240+
#[inline]
241+
#[must_use]
239242
fn shr(self, rhs: i32x8) -> Self::Output {
240243
pick! {
241244
if #[cfg(target_feature="avx2")] {
@@ -259,6 +262,9 @@ impl Shr<i32x8> for i32x8 {
259262
/// the type. (same as wrapping_shl)
260263
impl Shl<i32x8> for i32x8 {
261264
type Output = Self;
265+
266+
#[inline]
267+
#[must_use]
262268
fn shl(self, rhs: i32x8) -> Self::Output {
263269
pick! {
264270
if #[cfg(target_feature="avx2")] {
@@ -507,7 +513,8 @@ impl i32x8 {
507513
pub fn move_mask(self) -> i32 {
508514
pick! {
509515
if #[cfg(target_feature="avx2")] {
510-
move_mask_m256(cast(self.avx2)) as i32
516+
// use f32 move_mask since it is the same size as i32
517+
move_mask_m256(cast(self.avx2))
511518
} else {
512519
self.a.move_mask() | (self.b.move_mask() << 4)
513520
}
@@ -519,7 +526,7 @@ impl i32x8 {
519526
pub fn any(self) -> bool {
520527
pick! {
521528
if #[cfg(target_feature="avx2")] {
522-
((move_mask_i8_m256i(self.avx2) as u32) & 0b10001000100010001000100010001000) != 0
529+
move_mask_m256(cast(self.avx2)) != 0
523530
} else {
524531
(self.a | self.b).any()
525532
}
@@ -530,7 +537,7 @@ impl i32x8 {
530537
pub fn all(self) -> bool {
531538
pick! {
532539
if #[cfg(target_feature="avx2")] {
533-
((move_mask_i8_m256i(self.avx2) as u32) & 0b10001000100010001000100010001000) == 0b10001000100010001000100010001000
540+
move_mask_m256(cast(self.avx2)) == 0b11111111
534541
} else {
535542
(self.a & self.b).all()
536543
}

src/i64x2_.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,66 @@ impl i64x2 {
445445
cast([arr[0] as f64, arr[1] as f64])
446446
}
447447

448+
/// returns the bit mask for each high bit set in the vector with the lowest
449+
/// lane being the lowest bit
450+
#[inline]
451+
#[must_use]
452+
pub fn move_mask(self) -> i32 {
453+
pick! {
454+
if #[cfg(target_feature="sse")] {
455+
// use f64 move_mask since it is the same size as i64
456+
move_mask_m128d(cast(self.sse))
457+
} else if #[cfg(target_feature="simd128")] {
458+
i64x2_bitmask(self.simd) as i32
459+
} else {
460+
// nothing amazingly efficient for neon
461+
let arr: [u64; 2] = cast(self);
462+
(arr[0] >> 63 | ((arr[1] >> 62) & 2)) as i32
463+
}
464+
}
465+
}
466+
467+
/// true if any high bits are set for any value in the vector
468+
#[inline]
469+
#[must_use]
470+
pub fn any(self) -> bool {
471+
pick! {
472+
if #[cfg(target_feature="sse")] {
473+
// use f64 move_mask since it is the same size as i64
474+
move_mask_m128d(cast(self.sse)) != 0
475+
} else if #[cfg(target_feature="simd128")] {
476+
i64x2_bitmask(self.simd) != 0
477+
} else {
478+
let v : [u64;2] = cast(self);
479+
((v[0] | v[1]) & 0x8000000000000000) != 0
480+
}
481+
}
482+
}
483+
484+
/// true if all high bits are set for every value in the vector
485+
#[inline]
486+
#[must_use]
487+
pub fn all(self) -> bool {
488+
pick! {
489+
if #[cfg(target_feature="avx2")] {
490+
// use f64 move_mask since it is the same size as i64
491+
move_mask_m128d(cast(self.sse)) == 0b11
492+
} else if #[cfg(target_feature="simd128")] {
493+
i64x2_bitmask(self.simd) == 0b11
494+
} else {
495+
let v : [u64;2] = cast(self);
496+
((v[0] & v[1]) & 0x8000000000000000) == 0x8000000000000000
497+
}
498+
}
499+
}
500+
501+
/// true if no high bits are set for any values of the vector
502+
#[inline]
503+
#[must_use]
504+
pub fn none(self) -> bool {
505+
!self.any()
506+
}
507+
448508
#[inline]
449509
pub fn to_array(self) -> [i64; 2] {
450510
cast(self)

src/i64x4_.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,54 @@ impl i64x4 {
361361
cast([arr[0] as f64, arr[1] as f64, arr[2] as f64, arr[3] as f64])
362362
}
363363

364+
/// returns the bit mask for each high bit set in the vector with the lowest
365+
/// lane being the lowest bit
366+
#[inline]
367+
#[must_use]
368+
pub fn move_mask(self) -> i32 {
369+
pick! {
370+
if #[cfg(target_feature="avx2")] {
371+
// use f64 move_mask since it is the same size as i64
372+
move_mask_m256d(cast(self.avx2))
373+
} else {
374+
self.a.move_mask() | (self.b.move_mask() << 2)
375+
}
376+
}
377+
}
378+
379+
/// true if any high bits are set for any value in the vector
380+
#[inline]
381+
#[must_use]
382+
pub fn any(self) -> bool {
383+
pick! {
384+
if #[cfg(target_feature="avx2")] {
385+
move_mask_m256d(cast(self.avx2)) != 0
386+
} else {
387+
(self.a | self.b).any()
388+
}
389+
}
390+
}
391+
392+
/// true if all high bits are set for every value in the vector
393+
#[inline]
394+
#[must_use]
395+
pub fn all(self) -> bool {
396+
pick! {
397+
if #[cfg(target_feature="avx2")] {
398+
move_mask_m256d(cast(self.avx2)) == 0b1111
399+
} else {
400+
(self.a & self.b).all()
401+
}
402+
}
403+
}
404+
405+
/// true if no high bits are set for any values of the vector
406+
#[inline]
407+
#[must_use]
408+
pub fn none(self) -> bool {
409+
!self.any()
410+
}
411+
364412
#[inline]
365413
pub fn to_array(self) -> [i64; 4] {
366414
cast(self)

src/i8x16_.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,10 @@ impl i8x16 {
667667
move_mask_i8_m128i(self.sse) != 0
668668
} else if #[cfg(target_feature="simd128")] {
669669
u8x16_bitmask(self.simd) != 0
670+
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
671+
unsafe {
672+
vminvq_s8(self.neon) < 0
673+
}
670674
} else {
671675
let v : [u64;2] = cast(self);
672676
((v[0] | v[1]) & 0x80808080808080) != 0
@@ -681,6 +685,10 @@ impl i8x16 {
681685
move_mask_i8_m128i(self.sse) == 0b1111_1111_1111_1111
682686
} else if #[cfg(target_feature="simd128")] {
683687
u8x16_bitmask(self.simd) == 0b1111_1111_1111_1111
688+
} else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
689+
unsafe {
690+
vmaxvq_s8(self.neon) < 0
691+
}
684692
} else {
685693
let v : [u64;2] = cast(self);
686694
(v[0] & v[1] & 0x80808080808080) == 0x80808080808080

src/i8x32_.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ impl i8x32 {
294294
pub fn move_mask(self) -> i32 {
295295
pick! {
296296
if #[cfg(target_feature="avx2")] {
297-
move_mask_i8_m256i(self.avx) as i32
297+
move_mask_i8_m256i(self.avx)
298298
} else {
299299
self.a.move_mask() | (self.b.move_mask() << 16)
300300
}

src/u32x8_.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,9 @@ impl_shr_t_for_u32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
183183
/// the type. (same as wrapping_shr)
184184
impl Shr<u32x8> for u32x8 {
185185
type Output = Self;
186+
187+
#[inline]
188+
#[must_use]
186189
fn shr(self, rhs: u32x8) -> Self::Output {
187190
pick! {
188191
if #[cfg(target_feature="avx2")] {
@@ -206,6 +209,9 @@ impl Shr<u32x8> for u32x8 {
206209
/// the type. (same as wrapping_shl)
207210
impl Shl<u32x8> for u32x8 {
208211
type Output = Self;
212+
213+
#[inline]
214+
#[must_use]
209215
fn shl(self, rhs: u32x8) -> Self::Output {
210216
pick! {
211217
if #[cfg(target_feature="avx2")] {

0 commit comments

Comments
 (0)