diff --git a/fearless_simd/src/traits.rs b/fearless_simd/src/traits.rs index 055526af..0e284f54 100644 --- a/fearless_simd/src/traits.rs +++ b/fearless_simd/src/traits.rs @@ -119,6 +119,10 @@ impl SimdElement for i64 { type Mask = i64; } +impl SimdElement for u64 { + type Mask = i64; +} + /// Construction of integer vectors from floats by truncation pub trait SimdCvtTruncate { fn truncate_from(x: T) -> Self; diff --git a/fearless_simd_gen/src/mk_avx2.rs b/fearless_simd_gen/src/mk_avx2.rs index 9d501fd9..470eb009 100644 --- a/fearless_simd_gen/src/mk_avx2.rs +++ b/fearless_simd_gen/src/mk_avx2.rs @@ -112,9 +112,12 @@ fn mk_simd_impl() -> TokenStream { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; + type mask64s = mask64x2; #[inline(always)] fn level(self) -> Level { Level::#level_tok(self) diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index a69eca0f..40d796f6 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -402,9 +402,12 @@ fn mk_simd_impl() -> TokenStream { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; + type mask64s = mask64x2; #[inline(always)] fn level(self) -> Level { Level::#level_tok(self) diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 48ca55e6..d6a415da 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -409,9 +409,12 @@ fn mk_simd_impl(level: Level) -> TokenStream { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; + type mask64s = mask64x2; #[inline(always)] fn level(self) -> Level { Level::#level_tok(self) diff --git 
a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs index 0a784683..1bcc78ba 100644 --- a/fearless_simd_gen/src/mk_simd_trait.rs +++ b/fearless_simd_gen/src/mk_simd_trait.rs @@ -44,10 +44,12 @@ pub fn mk_simd_trait() -> TokenStream { type u32s: SimdInt, Mask = Self::mask32s> + SimdCvtTruncate; type i32s: SimdInt, Mask = Self::mask32s, Bytes = ::Bytes> + SimdCvtTruncate + core::ops::Neg; + type u64s: SimdInt, Mask = Self::mask64s>; // + SimdCvtTruncate; + type i64s: SimdInt, Mask = Self::mask64s, Bytes = ::Bytes> + core::ops::Neg; // + SimdCvtTruncate; type mask8s: SimdMask, Bytes = ::Bytes> + Select + Select + Select; type mask16s: SimdMask, Bytes = ::Bytes> + Select + Select + Select; - type mask32s: SimdMask, Bytes = ::Bytes> - + Select + Select + Select + Select; + type mask32s: SimdMask, Bytes = ::Bytes> + Select + Select + Select + Select; + type mask64s: SimdMask, Bytes = ::Bytes> + Select + Select + Select; // + Select fn level(self) -> Level; /// Call function with CPU features enabled. diff --git a/fearless_simd_gen/src/mk_sse4_2.rs b/fearless_simd_gen/src/mk_sse4_2.rs index 00ed79e4..26eadf46 100644 --- a/fearless_simd_gen/src/mk_sse4_2.rs +++ b/fearless_simd_gen/src/mk_sse4_2.rs @@ -110,9 +110,12 @@ fn mk_simd_impl() -> TokenStream { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; + type mask64s = mask64x2; #[inline(always)] fn level(self) -> Level { Level::#level_tok(self) @@ -258,6 +261,10 @@ pub(crate) fn handle_compare( let max_min_expr = arch.expr(max_min, vec_ty, &args); quote! { #eq_intrinsic(#max_min_expr, a.into()) } + } else if matches!(method, "simd_eq") && vec_ty.scalar_bits == 64 { + let eq = + simple_sign_unaware_intrinsic("cmpeq", vec_ty.scalar, vec_ty.scalar_bits, ty_bits); + quote! 
{ #eq(a.into(), b.into()) } } else if vec_ty.scalar == ScalarType::Unsigned { // SSE4.2 only has signed GT/LT, but not unsigned. let set = set1_intrinsic(vec_ty.scalar, vec_ty.scalar_bits, ty_bits); @@ -265,6 +272,7 @@ pub(crate) fn handle_compare( 8 => quote! { 0x80u8 }, 16 => quote! { 0x8000u16 }, 32 => quote! { 0x80000000u32 }, + 64 => quote! { 0x8000000000000000u64 }, _ => unimplemented!(), }; let gt = @@ -282,10 +290,29 @@ pub(crate) fn handle_compare( #gt(#args) } + } else if vec_ty.scalar_bits == 64 { + let intrinsic_name = if matches!(method, "simd_eq") { + "cmpeq" + } else { + "cmpgt" + }; + + let cmp = simple_intrinsic(intrinsic_name, vec_ty.scalar, vec_ty.scalar_bits, ty_bits); + // SSE4.2 only has signed GT for i64 + let args = if method == "simd_lt" { + quote! { b.into(), a.into() } + } else { + quote! { a.into(), b.into() } + }; + + quote! { + #cmp(#args) + } } else { arch.expr(method, vec_ty, &args) } } else { + // Floating point comparison arch.expr(method, vec_ty, &args) }; @@ -596,6 +623,16 @@ pub(crate) fn handle_unzip( quote! { unsafe { #intrinsic::<#mask>(a.into(), b.into()).simd_into(self) } } } else { match vec_ty.scalar_bits { + 64 => { + let op = if select_even { "lo" } else { "hi" }; + let intrinsic = format_ident!("_mm_unpack{op}_epi64"); + + quote! { + unsafe { + #intrinsic(a.into(), b.into()).simd_into(self) + } + } + } 32 => { let op = if select_even { "lo" } else { "hi" }; diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 5004ed60..9c1b560c 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -64,6 +64,7 @@ fn mk_simd_impl(level: Level) -> TokenStream { #[inline(always)] fn #method_ident(#args) -> #ret_ty }; + let m = match sig { OpSig::Splat => { let expr = Wasm.expr(method, vec_ty, &[quote! { val }]); @@ -118,6 +119,45 @@ fn mk_simd_impl(level: Level) -> TokenStream { OpSig::Binary => { let args = [quote! { a.into() }, quote! 
{ b.into() }]; match method { + "max" | "min" if vec_ty.scalar_bits == 64 && vec_ty.len == 2 => { + let is_max = method == "max"; + + let xor_for_unsigned = if vec_ty.scalar == ScalarType::Unsigned { + quote! { + let sign_bit = i64x2_splat(0x8000_0000_0000_0000u64 as i64); + let a_signed = v128_xor(a.into(), sign_bit); + let b_signed = v128_xor(b.into(), sign_bit); + } + } else { + quote! { + let a_signed = a.into(); + let b_signed = b.into(); + } + }; + + let body = if is_max { + quote! { + let mask = i64x2_gt(a_signed, b_signed); + let a_masked = v128_and(mask, a.into()); + let b_masked = v128_andnot(b.into(), mask); + v128_or(a_masked, b_masked) + } + } else { + quote! { + let mask = i64x2_gt(a_signed, b_signed); + let a_masked = v128_andnot(a.into(), mask); + let b_masked = v128_and(mask, b.into()); + v128_or(a_masked, b_masked) + } + }; + + quote! { + #method_sig { + #xor_for_unsigned + #body.simd_into(self) + } + } + } "mul" if vec_ty.scalar_bits == 8 && vec_ty.len == 16 => { let (extmul_low, extmul_high) = match vec_ty.scalar { ScalarType::Unsigned => ( @@ -183,9 +223,31 @@ fn mk_simd_impl(level: Level) -> TokenStream { OpSig::Compare => { let args = [quote! { a.into() }, quote! { b.into() }]; let expr = Wasm.expr(method, vec_ty, &args); - quote! { - #method_sig { - #expr.simd_into(self) + + let missing_op = ["lt", "gt", "le", "ge"] + .iter() + .find(|&op| method.ends_with(op)); + + if vec_ty.scalar_bits == 64 + && vec_ty.scalar == ScalarType::Unsigned + && missing_op.is_some() + { + let op = missing_op.unwrap(); + let wasm_ident = format_ident!("i64x2_{}", op); + quote! { + #method_sig { + let sign_bit = i64x2_splat(0x8000_0000_0000_0000u64 as i64); + let a_signed = v128_xor(a.into(), sign_bit); + let b_signed = v128_xor(b.into(), sign_bit); + + #wasm_ident(a_signed, b_signed).simd_into(self) + } + } + } else { + quote! { + #method_sig { + #expr.simd_into(self) + } } } } @@ -386,6 +448,13 @@ fn mk_simd_impl(level: Level) -> TokenStream { quote! 
{ 2, 3, 6, 7 }, quote! { u32x4_shuffle }, ), + 64 => ( + quote! { 0, 2 }, + quote! { 1, 3 }, + quote! { 0, 1 }, + quote! { 2, 3 }, + quote! { u64x2_shuffle }, + ), _ => panic!("unsupported scalar_bits"), }; @@ -455,6 +524,7 @@ fn mk_simd_impl(level: Level) -> TokenStream { quote! { 2, 6, 3, 7 }, quote! { u32x4_shuffle }, ), + 64 => (quote! { 0, 2 }, quote! { 1, 3 }, quote! { u64x2_shuffle }), _ => panic!("unsupported scalar_bits"), }; @@ -526,9 +596,12 @@ fn mk_simd_impl(level: Level) -> TokenStream { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; + type mask64s = mask64x2; #[inline(always)] fn level(self) -> Level { diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs index ed0639a0..e75ba06f 100644 --- a/fearless_simd_gen/src/types.rs +++ b/fearless_simd_gen/src/types.rs @@ -116,6 +116,8 @@ pub const SIMD_TYPES: &[VecType] = &[ VecType::new(ScalarType::Mask, 32, 4), VecType::new(ScalarType::Float, 64, 2), VecType::new(ScalarType::Mask, 64, 2), + VecType::new(ScalarType::Int, 64, 2), + VecType::new(ScalarType::Unsigned, 64, 2), // 256 bit types VecType::new(ScalarType::Float, 32, 8), VecType::new(ScalarType::Int, 8, 32), @@ -129,6 +131,8 @@ pub const SIMD_TYPES: &[VecType] = &[ VecType::new(ScalarType::Mask, 32, 8), VecType::new(ScalarType::Float, 64, 4), VecType::new(ScalarType::Mask, 64, 4), + VecType::new(ScalarType::Int, 64, 4), + VecType::new(ScalarType::Unsigned, 64, 4), // 512 bit types VecType::new(ScalarType::Float, 32, 16), VecType::new(ScalarType::Int, 8, 64), @@ -142,6 +146,8 @@ pub const SIMD_TYPES: &[VecType] = &[ VecType::new(ScalarType::Mask, 32, 16), VecType::new(ScalarType::Float, 64, 8), VecType::new(ScalarType::Mask, 64, 8), + VecType::new(ScalarType::Int, 64, 8), + VecType::new(ScalarType::Unsigned, 64, 8), ]; pub fn type_imports() -> TokenStream { diff --git 
a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index adb43ffa..6fd59f3b 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -703,6 +703,20 @@ fn zip_high_u32x4(simd: S) { assert_eq!(simd.zip_high_u32x4(a, b).val, [2, 6, 3, 7]); } +#[simd_test] +fn zip_low_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[0, 1]); + let b = u64x2::from_slice(simd, &[4, 5]); + assert_eq!(simd.zip_low_u64x2(a, b).val, [0, 4]); +} + +#[simd_test] +fn zip_high_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[0, 1]); + let b = u64x2::from_slice(simd, &[4, 5]); + assert_eq!(simd.zip_high_u64x2(a, b).val, [1, 5]); +} + #[simd_test] fn unzip_low_f32x4(simd: S) { let a = f32x4::from_slice(simd, &[1.0, 2.0, 3.0, 4.0]); @@ -871,6 +885,34 @@ fn unzip_high_u32x4(simd: S) { assert_eq!(simd.unzip_high_u32x4(a, b).val, [2, 4, 6, 8]); } +#[simd_test] +fn unzip_low_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[1, 2]); + let b = i64x2::from_slice(simd, &[3, 4]); + assert_eq!(simd.unzip_low_i64x2(a, b).val, [1, 3]); +} + +#[simd_test] +fn unzip_high_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[1, 2]); + let b = i64x2::from_slice(simd, &[3, 4]); + assert_eq!(simd.unzip_high_i64x2(a, b).val, [2, 4]); +} + +#[simd_test] +fn unzip_low_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[1, 2]); + let b = u64x2::from_slice(simd, &[3, 4]); + assert_eq!(simd.unzip_low_u64x2(a, b).val, [1, 3]); +} + +#[simd_test] +fn unzip_high_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[1, 2]); + let b = u64x2::from_slice(simd, &[3, 4]); + assert_eq!(simd.unzip_high_u64x2(a, b).val, [2, 4]); +} + #[simd_test] fn unzip_low_f64x2(simd: S) { let a = f64x2::from_slice(simd, &[1.0, 2.0]); @@ -944,6 +986,12 @@ fn shr_u32x4(simd: S) { assert_eq!((a >> 8).val, [16777215, 8388608, 256, 1]); } +#[simd_test] +fn shr_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[u64::MAX, 2147483648u64]); + assert_eq!((a >> 
8).val, [u64::MAX >> 8, 8388608]); +} + #[simd_test] fn shrv_u32x4(simd: S) { let a = u32x4::from_slice(simd, &[u32::MAX, 2147483648, 65536, 256]); @@ -969,6 +1017,12 @@ fn shl_u32x4(simd: S) { assert_eq!((a << 4).val, [0xFFFFFFF0, 0xFFFF0, 0xFF0, 0]); } +#[simd_test] +fn shl_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[0xFFFFFFFFFFFFFFFu64, 0]); + assert_eq!((a << 4).val, [0xFFFFFFFFFFFFFFF0u64, 0]); +} + #[simd_test] fn select_f32x4(simd: S) { let mask = mask32x4::from_slice(simd, &[-1, 0, -1, 0]); @@ -1102,6 +1156,23 @@ fn select_mask32x4(simd: S) { assert_eq!(result.val, [-1, 0, 0, -1]); } +#[simd_test] +fn select_u64x2(simd: S) { + let mask = mask64x2::from_slice(simd, &[0, -1]); + let b = u64x2::from_slice(simd, &[100000, 200000]); + let c = u64x2::from_slice(simd, &[1000, 2000]); + assert_eq!(mask.select(b, c).val, [1000, 200000]); +} + +#[simd_test] +fn select_mask64x2(simd: S) { + let mask = mask64x2::from_slice(simd, &[-1, 0]); + let b = mask64x2::from_slice(simd, &[-1, 42]); + let c = mask64x2::from_slice(simd, &[100, -1]); + let result: mask64x2<_> = mask.select(b, c); + assert_eq!(result.val, [-1, -1]); +} + #[simd_test] fn widen_u8x16(simd: S) { let a = u8x16::from_slice( @@ -1306,6 +1377,86 @@ fn simd_ge_i8x16(simd: S) { ); } +#[simd_test] +fn simd_ge_i64x2(simd: S) { + let vals = i64x2::from_slice(simd, &[0, -45]); + let mask = vals.simd_ge(i64x2::splat(simd, -1)); + + assert_eq!(mask.val, [-1, 0]); +} + +#[simd_test] +fn simd_ge_u64x2(simd: S) { + let vals = u64x2::from_slice(simd, &[0, 45]); + let mask = vals.simd_ge(u64x2::splat(simd, 45)); + + assert_eq!(mask.val, [0, -1]); +} + +#[simd_test] +fn simd_le_i64x2(simd: S) { + let vals = i64x2::from_slice(simd, &[0, -45]); + let mask = vals.simd_le(i64x2::splat(simd, -1)); + + assert_eq!(mask.val, [0, -1]); +} + +#[simd_test] +fn simd_le_u64x2(simd: S) { + let vals = u64x2::from_slice(simd, &[0, 45]); + let mask = vals.simd_le(u64x2::splat(simd, 45)); + + assert_eq!(mask.val, [-1, -1]); +} + 
+#[simd_test] +fn simd_lt_u64x2(simd: S) { + let vals = u64x2::from_slice(simd, &[0, 45]); + let mask = vals.simd_lt(u64x2::splat(simd, 45)); + + assert_eq!(mask.val, [-1, 0]); +} + +#[simd_test] +fn simd_lt_i64x2(simd: S) { + let vals = i64x2::from_slice(simd, &[0, -45]); + let mask = vals.simd_lt(i64x2::splat(simd, 0)); + + assert_eq!(mask.val, [0, -1]); +} + +#[simd_test] +fn simd_gt_u64x2(simd: S) { + let vals = u64x2::from_slice(simd, &[0, 45]); + let mask = vals.simd_gt(u64x2::splat(simd, 45)); + + assert_eq!(mask.val, [0, 0]); +} + +#[simd_test] +fn simd_gt_i64x2(simd: S) { + let vals = i64x2::from_slice(simd, &[0, 45]); + let mask = vals.simd_gt(i64x2::splat(simd, 44)); + + assert_eq!(mask.val, [0, -1]); +} + +#[simd_test] +fn simd_eq_u64x2(simd: S) { + let vals = u64x2::from_slice(simd, &[45, 45]); + let mask = vals.simd_eq(u64x2::splat(simd, 45)); + + assert_eq!(mask.val, [-1, -1]); +} + +#[simd_test] +fn simd_eq_i64x2(simd: S) { + let vals = i64x2::from_slice(simd, &[-3, -3]); + let mask = vals.simd_eq(i64x2::splat(simd, -3)); + + assert_eq!(mask.val, [-1, -1]); +} + #[simd_test] fn select_native_width_vectors(simd: S) { // Test with native f32 vectors @@ -1352,6 +1503,20 @@ fn select_native_width_vectors(simd: S) { let b_i16 = S::i16s::from_slice(simd, &vec![-50i16; S::i16s::N]); let result_i16 = mask_u16.select(a_i16, b_i16); assert_eq!(result_i16.as_slice(), vec![50i16; S::i16s::N]); + + // Test with native i64 vectors + let a_i64 = S::i64s::from_slice(simd, &vec![50i64; S::i64s::N]); + let b_i64 = S::i64s::from_slice(simd, &vec![-50i64; S::i64s::N]); + let mask_i64 = S::mask64s::from_slice(simd, &vec![-1i64; S::mask64s::N]); + let result_i64 = mask_i64.select(a_i64, b_i64); + assert_eq!(result_i64.as_slice(), vec![50i64; S::i64s::N]); + + // Test with native u64 vectors + let a_u64 = S::u64s::from_slice(simd, &vec![100u64; S::u64s::N]); + let b_u64 = S::u64s::from_slice(simd, &vec![200u64; S::u64s::N]); + let mask_u64 = S::mask64s::from_slice(simd, 
&vec![-1i64; S::mask64s::N]); + let result_u64 = mask_u64.select(a_u64, b_u64); + assert_eq!(result_u64.as_slice(), vec![100u64; S::u64s::N]); } #[simd_test]