diff --git a/Cargo.toml b/Cargo.toml index 142f4fd..99ae92f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ num-traits = "0.2.19" multiversion = "0.8.0" num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true } bytemuck = { version = "1.23.2", optional = true } -wide = "1.0.3" +wide = "0.8.1" rayon = { version = "1.11.0", optional = true } [features] diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 1f4c4d7..879b556 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -153,7 +153,7 @@ pub fn fft_dit_chunk_8_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let in1_im = f64x4::new(imags_s1[0..4].try_into().unwrap()); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im - let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re)); + let out0_re = sqrt2_2_im.mul_neg_add(in1_im, sqrt2_2.mul_add(in1_re, in0_re)); // out0.im = (in0.im + w.re*in1.im) + w.im*in1.re let out0_im = sqrt2_2_im.mul_add(in1_re, sqrt2_2.mul_add(in1_im, in0_im)); @@ -208,7 +208,7 @@ pub fn fft_dit_chunk_8_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let in1_im = f32x4::new(imags_s1[0..4].try_into().unwrap()); // out0.re = (in0.re + w.re * in1.re) - w.im * in1.im - let out0_re = sqrt2_2_im.mul_add(-in1_im, sqrt2_2.mul_add(in1_re, in0_re)); + let out0_re = sqrt2_2_im.mul_neg_add(in1_im, sqrt2_2.mul_add(in1_re, in0_re)); // out0.im = (in0.im + w.re * in1.im) + w.im * in1.re let out0_im = sqrt2_2_im.mul_add(in1_re, sqrt2_2.mul_add(in1_im, in0_im)); @@ -274,7 +274,7 @@ pub fn fft_dit_chunk_16_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let in0_im = f64x8::new(imags_s0[0..8].try_into().unwrap()); let in1_im = f64x8::new(imags_s1[0..8].try_into().unwrap()); - let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); + let out0_re = twiddle_im.mul_neg_add(in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); // out1 = 2*in0 - out0 @@ -339,7 +339,7 @@ pub fn fft_dit_chunk_16_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let in0_im = f32x8::new(imags_s0[0..8].try_into().unwrap()); let in1_im = f32x8::new(imags_s1[0..8].try_into().unwrap()); - let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); + let out0_re = twiddle_im.mul_neg_add(in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); // out1 = 2*in0 - out0 @@ -426,8 +426,8 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let in0_im_0_7 = f64x8::new(imags_s0[0..8].try_into().unwrap()); let in1_im_0_7 = f64x8::new(imags_s1[0..8].try_into().unwrap()); - let out0_re_0_7 = - twiddle_im_0_7.mul_add(-in1_im_0_7, twiddle_re_0_7.mul_add(in1_re_0_7, in0_re_0_7)); + let out0_re_0_7 = twiddle_im_0_7 + .mul_neg_add(in1_im_0_7, twiddle_re_0_7.mul_add(in1_re_0_7, in0_re_0_7)); let out0_im_0_7 = twiddle_im_0_7.mul_add(in1_re_0_7, twiddle_re_0_7.mul_add(in1_im_0_7, in0_im_0_7)); @@ -445,8 +445,8 @@ pub fn fft_dit_chunk_32_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let in0_im_8_15 = f64x8::new(imags_s0[8..16].try_into().unwrap()); let in1_im_8_15 = f64x8::new(imags_s1[8..16].try_into().unwrap()); - let out0_re_8_15 = twiddle_im_8_15.mul_add( - -in1_im_8_15, + let out0_re_8_15 = twiddle_im_8_15.mul_neg_add( + in1_im_8_15, twiddle_re_8_15.mul_add(in1_re_8_15, in0_re_8_15), ); let out0_im_8_15 = twiddle_im_8_15.mul_add( @@ -531,7 +531,7 @@ pub fn fft_dit_chunk_32_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let in0_im = f32x16::new(imags_s0[0..16].try_into().unwrap()); let in1_im = f32x16::new(imags_s1[0..16].try_into().unwrap()); - let out0_re = twiddle_im.mul_add(-in1_im, twiddle_re.mul_add(in1_re, in0_re)); + let out0_re = twiddle_im.mul_neg_add(in1_im, twiddle_re.mul_add(in1_re, in0_re)); let out0_im = twiddle_im.mul_add(in1_re, twiddle_re.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); @@ -665,7 +665,8 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let in0_im = f64x8::new(imags_s0[0..8].try_into().unwrap()); let in1_im = f64x8::new(imags_s1[0..8].try_into().unwrap()); - let out0_re = twiddle_im_0_7.mul_add(-in1_im, twiddle_re_0_7.mul_add(in1_re, in0_re)); + let out0_re = + twiddle_im_0_7.mul_neg_add(in1_im, twiddle_re_0_7.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_0_7.mul_add(in1_re, twiddle_re_0_7.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); @@ -681,7 +682,8 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let in0_im = f64x8::new(imags_s0[8..16].try_into().unwrap()); let in1_im = f64x8::new(imags_s1[8..16].try_into().unwrap()); - let out0_re = twiddle_im_8_15.mul_add(-in1_im, twiddle_re_8_15.mul_add(in1_re, in0_re)); + let out0_re = + twiddle_im_8_15.mul_neg_add(in1_im, twiddle_re_8_15.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_8_15.mul_add(in1_re, twiddle_re_8_15.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); @@ -698,7 +700,7 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let in1_im = f64x8::new(imags_s1[16..24].try_into().unwrap()); let out0_re = - twiddle_im_16_23.mul_add(-in1_im, twiddle_re_16_23.mul_add(in1_re, in0_re)); + twiddle_im_16_23.mul_neg_add(in1_im, twiddle_re_16_23.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_16_23.mul_add(in1_re, twiddle_re_16_23.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); @@ -716,7 +718,7 @@ pub fn fft_dit_chunk_64_simd_f64(reals: &mut [f64], imags: &mut [f64]) { let in1_im = f64x8::new(imags_s1[24..32].try_into().unwrap()); let out0_re = - twiddle_im_24_31.mul_add(-in1_im, twiddle_re_24_31.mul_add(in1_re, in0_re)); + twiddle_im_24_31.mul_neg_add(in1_im, twiddle_re_24_31.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_24_31.mul_add(in1_re, twiddle_re_24_31.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); @@ -836,7 +838,8 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let in0_im = f32x16::new(imags_s0[0..16].try_into().unwrap()); let in1_im = f32x16::new(imags_s1[0..16].try_into().unwrap()); - let out0_re = twiddle_im_0_15.mul_add(-in1_im, twiddle_re_0_15.mul_add(in1_re, in0_re)); + let out0_re = + twiddle_im_0_15.mul_neg_add(in1_im, twiddle_re_0_15.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_0_15.mul_add(in1_re, twiddle_re_0_15.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); let out1_im = two.mul_sub(in0_im, out0_im); @@ -853,7 +856,7 @@ pub fn fft_dit_chunk_64_simd_f32(reals: &mut [f32], imags: &mut [f32]) { let in1_im = f32x16::new(imags_s1[16..32].try_into().unwrap()); let out0_re = - twiddle_im_16_31.mul_add(-in1_im, twiddle_re_16_31.mul_add(in1_re, in0_re)); + twiddle_im_16_31.mul_neg_add(in1_im, twiddle_re_16_31.mul_add(in1_re, in0_re)); let out0_im = twiddle_im_16_31.mul_add(in1_re, twiddle_re_16_31.mul_add(in1_im, in0_im)); let out1_re = two.mul_sub(in0_re, out0_re); @@ -911,7 +914,7 @@ pub fn fft_dit_64_chunk_n_simd( let tw_im = f64x8::new(*tw_im); // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im - let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re)); + let out0_re = tw_im.mul_neg_add(in1_im, tw_re.mul_add(in1_re, in0_re)); // out0.im = (in0.im + tw_re * in1.im) + tw_im * in1.re let out0_im = tw_im.mul_add(in1_re, tw_re.mul_add(in1_im, in0_im)); @@ -972,7 +975,7 @@ pub fn fft_dit_32_chunk_n_simd( let tw_im = f32x16::new(*tw_im); // out0.re = (in0.re + tw_re * in1.re) - tw_im * in1.im - let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re)); + let out0_re = tw_im.mul_neg_add(in1_im, tw_re.mul_add(in1_re, in0_re)); // out0.im = (in0.im + tw_re * in1.im) + tw_im * in1.re let out0_im = tw_im.mul_add(in1_re, tw_re.mul_add(in1_im, in0_im));