Lokathor
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 3 additions & 0 deletions
diff --git a/src/i16x8_.rs b/src/i16x8_.rs
Lines changed: 8 additions & 0 deletions
diff --git a/src/i32x4_.rs b/src/i32x4_.rs
Lines changed: 28 additions & 9 deletions
diff --git a/src/i32x8_.rs b/src/i32x8_.rs
Lines changed: 10 additions & 3 deletions
diff --git a/src/i64x2_.rs b/src/i64x2_.rs
Lines changed: 60 additions & 0 deletions
diff --git a/src/i64x4_.rs b/src/i64x4_.rs
Lines changed: 48 additions & 0 deletions
diff --git a/src/i8x16_.rs b/src/i8x16_.rs
Lines changed: 8 additions & 0 deletions
diff --git a/src/i8x32_.rs b/src/i8x32_.rs
Lines changed: 1 addition & 1 deletion
diff --git a/src/u32x8_.rs b/src/u32x8_.rs
Lines changed: 6 additions & 0 deletions
@@ -23,3 +23,6 @@ std = []
 [dependencies]
 safe_arch = { version = "0.7", features = ["bytemuck"] }
 bytemuck = "1"
+
+[dev-dependencies]
+rand = "0.8"
@@ -487,6 +487,10 @@ impl i16x8 {
         (move_mask_i8_m128i(self.sse) & 0b1010101010101010) != 0
       } else if #[cfg(target_feature="simd128")] {
         u16x8_bitmask(self.simd) != 0
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
+        unsafe {
+          vminvq_s16(self.neon) < 0
+        }
       } else {
         let v : [u64;2] = cast(self);
         ((v[0] | v[1]) & 0x8000800080008000) != 0
@@ -502,6 +506,10 @@ impl i16x8 {
         (move_mask_i8_m128i(self.sse) & 0b1010101010101010) == 0b1010101010101010
       } else if #[cfg(target_feature="simd128")] {
         u16x8_bitmask(self.simd) == 0b11111111
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
+        unsafe {
+          vmaxvq_s16(self.neon) < 0
+        }
       } else {
         let v : [u64;2] = cast(self);
         (v[0] & v[1] & 0x8000800080008000) == 0x8000800080008000
 
@@ -331,6 +331,9 @@ impl_shr_t_for_i32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
 /// the type. (same as wrapping_shr)
 impl Shr<i32x4> for i32x4 {
   type Output = Self;
+
+  #[inline]
+  #[must_use]
   fn shr(self, rhs: i32x4) -> Self::Output {
     pick! {
       if #[cfg(target_feature="avx2")] {
@@ -365,6 +368,9 @@ impl Shr<i32x4> for i32x4 {
 /// the type. (same as wrapping_shl)
 impl Shl<i32x4> for i32x4 {
   type Output = Self;
+
+  #[inline]
+  #[must_use]
   fn shl(self, rhs: i32x4) -> Self::Output {
     pick! {
       if #[cfg(target_feature="avx2")] {
@@ -618,6 +624,7 @@ impl i32x4 {
   pub fn move_mask(self) -> i32 {
     pick! {
       if #[cfg(target_feature="sse")] {
+        // use f32 move_mask since it is the same size as i32
         move_mask_m128(cast(self.sse))
       } else if #[cfg(target_feature="simd128")] {
         u32x4_bitmask(self.simd) as i32
@@ -631,14 +638,14 @@ impl i32x4 {
           let selectbit : uint32x4_t = core::intrinsics::transmute([1u32, 2, 4, 8]);
           let r = vandq_u32(masked, selectbit);
 
-          // horizontally add the 16-bit lanes
+          // horizontally add the 32-bit lanes
           vaddvq_u32(r) as i32
          }
       } else {
-        (((self.arr[0] as i32) < 0) as i32) << 0 |
-        (((self.arr[1] as i32) < 0) as i32) << 1 |
-        (((self.arr[2] as i32) < 0) as i32) << 2 |
-        (((self.arr[3] as i32) < 0) as i32) << 3
+        ((self.arr[0] < 0) as i32) << 0 |
+        ((self.arr[1] < 0) as i32) << 1 |
+        ((self.arr[2] < 0) as i32) << 2 |
+        ((self.arr[3] < 0) as i32) << 3
       }
     }
   }
@@ -647,10 +654,16 @@ impl i32x4 {
   #[must_use]
   pub fn any(self) -> bool {
     pick! {
-      if #[cfg(target_feature="sse2")] {
-        (move_mask_i8_m128i(self.sse) & 0b1000100010001000) != 0
+      if #[cfg(target_feature="sse")] {
+        // use f32 move_mask since it is the same size as i32
+        move_mask_m128(cast(self.sse)) != 0
       } else if #[cfg(target_feature="simd128")] {
         u32x4_bitmask(self.simd) != 0
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
+        // some lanes are negative
+        unsafe {
+          vminvq_s32(self.neon) < 0
+        }
       } else {
         let v : [u64;2] = cast(self);
         ((v[0] | v[1]) & 0x8000000080000000) != 0
@@ -662,10 +675,16 @@ impl i32x4 {
   #[must_use]
   pub fn all(self) -> bool {
     pick! {
-      if #[cfg(target_feature="sse2")] {
-        (move_mask_i8_m128i(self.sse) & 0b1000100010001000) == 0b1000100010001000
+      if #[cfg(target_feature="sse")] {
+        // use f32 move_mask since it is the same size as i32
+        move_mask_m128(cast(self.sse)) == 0b1111
       } else if #[cfg(target_feature="simd128")] {
         u32x4_bitmask(self.simd) == 0b1111
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
+        // all lanes are negative
+        unsafe {
+          vmaxvq_s32(self.neon) < 0
+        }
       } else {
         let v : [u64;2] = cast(self);
         (v[0] & v[1] & 0x8000000080000000) == 0x8000000080000000
 
@@ -236,6 +236,9 @@ impl_shr_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
 /// the type. (same as wrapping_shr)
 impl Shr<i32x8> for i32x8 {
   type Output = Self;
+
+  #[inline]
+  #[must_use]
   fn shr(self, rhs: i32x8) -> Self::Output {
     pick! {
       if #[cfg(target_feature="avx2")] {
@@ -259,6 +262,9 @@ impl Shr<i32x8> for i32x8 {
 /// the type. (same as wrapping_shl)
 impl Shl<i32x8> for i32x8 {
   type Output = Self;
+
+  #[inline]
+  #[must_use]
   fn shl(self, rhs: i32x8) -> Self::Output {
     pick! {
       if #[cfg(target_feature="avx2")] {
@@ -507,7 +513,8 @@ impl i32x8 {
   pub fn move_mask(self) -> i32 {
     pick! {
       if #[cfg(target_feature="avx2")] {
-        move_mask_m256(cast(self.avx2)) as i32
+        // use f32 move_mask since it is the same size as i32
+        move_mask_m256(cast(self.avx2))
       } else {
         self.a.move_mask() | (self.b.move_mask() << 4)
       }
@@ -519,7 +526,7 @@ impl i32x8 {
   pub fn any(self) -> bool {
     pick! {
       if #[cfg(target_feature="avx2")] {
-        ((move_mask_i8_m256i(self.avx2) as u32) & 0b10001000100010001000100010001000) != 0
+        move_mask_m256(cast(self.avx2)) != 0
       } else {
         (self.a | self.b).any()
       }
@@ -530,7 +537,7 @@ impl i32x8 {
   pub fn all(self) -> bool {
     pick! {
       if #[cfg(target_feature="avx2")] {
-        ((move_mask_i8_m256i(self.avx2) as u32) & 0b10001000100010001000100010001000) == 0b10001000100010001000100010001000
+        move_mask_m256(cast(self.avx2)) == 0b11111111
       } else {
         (self.a & self.b).all()
       }
 
@@ -445,6 +445,66 @@ impl i64x2 {
     cast([arr[0] as f64, arr[1] as f64])
   }
 
+  /// Packs the sign (highest) bit of each lane into the low bits of an `i32`.
+  ///
+  /// Lane 0 maps to bit 0 and lane 1 maps to bit 1; no other bits are set.
+  #[inline]
+  #[must_use]
+  pub fn move_mask(self) -> i32 {
+    pick! {
+      if #[cfg(target_feature="sse2")] {
+        // use f64 move_mask since it is the same size as i64; the f64
+        // (m128d) operations are SSE2, so gate on sse2 rather than plain sse
+        move_mask_m128d(cast(self.sse))
+      } else if #[cfg(target_feature="simd128")] {
+        i64x2_bitmask(self.simd) as i32
+      } else {
+        // nothing amazingly efficient for neon
+        let arr: [u64; 2] = cast(self);
+        // lane 0 sign bit lands in bit 0, lane 1 sign bit lands in bit 1
+        (arr[0] >> 63 | ((arr[1] >> 62) & 2)) as i32
+      }
+    }
+  }
+
+  /// Returns `true` if the sign (highest) bit is set in at least one lane.
+  #[inline]
+  #[must_use]
+  pub fn any(self) -> bool {
+    pick! {
+      if #[cfg(target_feature="sse2")] {
+        // use f64 move_mask since it is the same size as i64; the f64
+        // (m128d) operations are SSE2, so gate on sse2 rather than plain sse
+        move_mask_m128d(cast(self.sse)) != 0
+      } else if #[cfg(target_feature="simd128")] {
+        i64x2_bitmask(self.simd) != 0
+      } else {
+        let v : [u64;2] = cast(self);
+        // OR the lanes together, then test the shared sign-bit position
+        ((v[0] | v[1]) & 0x8000000000000000) != 0
+      }
+    }
+  }
+
+  /// Returns `true` if the sign (highest) bit is set in every lane.
+  #[inline]
+  #[must_use]
+  pub fn all(self) -> bool {
+    pick! {
+      // This branch only touches the 128-bit `sse` register, so it is gated
+      // on sse2 (not avx2); otherwise SSE2-only builds fall through to the
+      // scalar path below.
+      if #[cfg(target_feature="sse2")] {
+        // use f64 move_mask since it is the same size as i64
+        move_mask_m128d(cast(self.sse)) == 0b11
+      } else if #[cfg(target_feature="simd128")] {
+        i64x2_bitmask(self.simd) == 0b11
+      } else {
+        let v : [u64;2] = cast(self);
+        // AND the lanes together: the sign bit survives only if both had it
+        ((v[0] & v[1]) & 0x8000000000000000) == 0x8000000000000000
+      }
+    }
+  }
+
+  /// Returns `true` when no lane has its sign (highest) bit set; this is the
+  /// exact negation of `any`.
+  #[inline]
+  #[must_use]
+  pub fn none(self) -> bool {
+    let any_set = self.any();
+    !any_set
+  }
+
   #[inline]
   pub fn to_array(self) -> [i64; 2] {
     cast(self)
 
@@ -361,6 +361,54 @@ impl i64x4 {
     cast([arr[0] as f64, arr[1] as f64, arr[2] as f64, arr[3] as f64])
   }
 
+  /// Collects the sign (highest) bit of each lane into the low bits of an
+  /// `i32`, with the lowest lane in the lowest bit.
+  #[inline]
+  #[must_use]
+  pub fn move_mask(self) -> i32 {
+    pick! {
+      if #[cfg(target_feature="avx2")] {
+        // i64 and f64 lanes have the same width, so the f64 move_mask applies
+        move_mask_m256d(cast(self.avx2))
+      } else {
+        // the two halves each yield a 2-bit mask; `b` fills the upper pair
+        let low_bits = self.a.move_mask();
+        let high_bits = self.b.move_mask() << 2;
+        low_bits | high_bits
+      }
+    }
+  }
+
+  /// Returns `true` if the sign (highest) bit is set in at least one lane.
+  #[inline]
+  #[must_use]
+  pub fn any(self) -> bool {
+    pick! {
+      if #[cfg(target_feature="avx2")] {
+        let mask = move_mask_m256d(cast(self.avx2));
+        mask != 0
+      } else {
+        // OR the halves so a set sign bit in either one is kept
+        let merged = self.a | self.b;
+        merged.any()
+      }
+    }
+  }
+
+  /// Returns `true` if the sign (highest) bit is set in every lane.
+  #[inline]
+  #[must_use]
+  pub fn all(self) -> bool {
+    pick! {
+      if #[cfg(target_feature="avx2")] {
+        let mask = move_mask_m256d(cast(self.avx2));
+        mask == 0b1111
+      } else {
+        // AND the halves so a sign bit survives only if both halves had it
+        let merged = self.a & self.b;
+        merged.all()
+      }
+    }
+  }
+
+  /// Returns `true` when no lane has its sign (highest) bit set; this is the
+  /// exact negation of `any`.
+  #[inline]
+  #[must_use]
+  pub fn none(self) -> bool {
+    let any_set = self.any();
+    !any_set
+  }
+
   #[inline]
   pub fn to_array(self) -> [i64; 4] {
     cast(self)
 
@@ -667,6 +667,10 @@ impl i8x16 {
         move_mask_i8_m128i(self.sse) != 0
       } else if #[cfg(target_feature="simd128")] {
         u8x16_bitmask(self.simd) != 0
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
+        unsafe {
+          vminvq_s8(self.neon) < 0
+        }
       } else {
         let v : [u64;2] = cast(self);
         ((v[0] | v[1]) & 0x80808080808080) != 0
@@ -681,6 +685,10 @@ impl i8x16 {
         move_mask_i8_m128i(self.sse) == 0b1111_1111_1111_1111
       } else if #[cfg(target_feature="simd128")] {
         u8x16_bitmask(self.simd) == 0b1111_1111_1111_1111
+      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
+        unsafe {
+          vmaxvq_s8(self.neon) < 0
+        }
       } else {
         let v : [u64;2] = cast(self);
         (v[0] & v[1] & 0x80808080808080) == 0x80808080808080
 
@@ -294,7 +294,7 @@ impl i8x32 {
   pub fn move_mask(self) -> i32 {
     pick! {
       if #[cfg(target_feature="avx2")] {
-        move_mask_i8_m256i(self.avx) as i32
+        move_mask_i8_m256i(self.avx)
       } else {
         self.a.move_mask() | (self.b.move_mask() << 16)
       }
 
@@ -183,6 +183,9 @@ impl_shr_t_for_u32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
 /// the type. (same as wrapping_shr)
 impl Shr<u32x8> for u32x8 {
   type Output = Self;
+
+  #[inline]
+  #[must_use]
   fn shr(self, rhs: u32x8) -> Self::Output {
     pick! {
       if #[cfg(target_feature="avx2")] {
@@ -206,6 +209,9 @@ impl Shr<u32x8> for u32x8 {
 /// the type. (same as wrapping_shl)
 impl Shl<u32x8> for u32x8 {
   type Output = Self;
+
+  #[inline]
+  #[must_use]
   fn shl(self, rhs: u32x8) -> Self::Output {
     pick! {
       if #[cfg(target_feature="avx2")] {
Original file line number	Diff line number	Diff line change
`@@ -294,7 +294,7 @@ impl i8x32 {`
`294`	`294`	`pub fn move_mask(self) -> i32 {`
`295`	`295`	`pick! {`
`296`	`296`	`if #[cfg(target_feature="avx2")] {`
`297`		`- move_mask_i8_m256i(self.avx) as i32`
	`297`	`+ move_mask_i8_m256i(self.avx)`
`298`	`298`	`} else {`
`299`	`299`	`self.a.move_mask() \| (self.b.move_mask() << 16)`
`300`	`300`	`}`