diff --git a/.cargo/config.toml b/.cargo/config.toml
index d13388f..eb5b65a 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -4,5 +4,8 @@ rustflags = ["-Ctarget-cpu=native"]
 [target.x86_64-apple-darwin]
 rustflags = ["-Ctarget-feature=+sse4.1"]
 
+[target.wasm32-unknown-unknown]
+rustflags = ["-C", "target-feature=+simd128"]
+
 [env]
 DYLD_FALLBACK_LIBRARY_PATH="/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/"
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index f419f2e..aa815f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -554,7 +554,7 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8"
 
 [[package]]
 name = "libblur"
-version = "0.13.4"
+version = "0.13.5"
 dependencies = [
  "colorutils-rs",
  "erydanos",
diff --git a/README.md b/README.md
index 2e89f20..d796051 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Fast blur algorithms library for Rust
 
 There are some very good and blazing fast algorithms that do blurring images.
-Best optimized for NEON and SSE.
+Best optimized for NEON and SSE, with partial AVX and partial WASM SIMD support.
 
 You may receive gaussian blur in 100 FPS for 4K photo.
 
diff --git a/src/lib/Cargo.toml b/src/lib/Cargo.toml
index f93b3e4..d6b95c5 100644
--- a/src/lib/Cargo.toml
+++ b/src/lib/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "libblur"
-version = "0.13.4"
+version = "0.13.5"
 edition = "2021"
 description = "Fast image blurring in pure Rust"
 readme = "../../README.md"
diff --git a/src/lib/fast_gaussian.rs b/src/lib/fast_gaussian.rs
index 79b9cfb..8352963 100644
--- a/src/lib/fast_gaussian.rs
+++ b/src/lib/fast_gaussian.rs
@@ -53,6 +53,8 @@ use crate::sse::{
 use crate::threading_policy::ThreadingPolicy;
 use crate::to_storage::ToStorage;
 use crate::unsafe_slice::UnsafeSlice;
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+use crate::wasm32::{fast_gaussian_horizontal_pass_wasm_u8, fast_gaussian_vertical_pass_wasm_u8};
 use crate::{clamp_edge, reflect_101, EdgeMode};
 
 const BASE_RADIUS_I64_CUTOFF: u32 = 180;
@@ -708,6 +710,15 @@ fn fast_gaussian_impl<
                 }
             }
         }
+        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+        {
+            if std::any::type_name::<T>() == "u8" && BASE_RADIUS_I64_CUTOFF > radius {
+                _dispatcher_vertical =
+                    fast_gaussian_vertical_pass_wasm_u8::<T, CHANNELS_COUNT, EDGE_MODE>;
+                _dispatcher_horizontal =
+                    fast_gaussian_horizontal_pass_wasm_u8::<T, CHANNELS_COUNT, EDGE_MODE>;
+            }
+        }
     }
     if thread_count == 1 {
         _dispatcher_vertical(&unsafe_image, stride, width, height, radius, 0, width);
diff --git a/src/lib/fast_gaussian_next.rs b/src/lib/fast_gaussian_next.rs
index fd5180e..d4f66e7 100644
--- a/src/lib/fast_gaussian_next.rs
+++ b/src/lib/fast_gaussian_next.rs
@@ -42,6 +42,10 @@ use crate::sse::{
 };
 use crate::to_storage::ToStorage;
 use crate::unsafe_slice::UnsafeSlice;
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+use crate::wasm32::{
+    fast_gaussian_next_horizontal_pass_wasm_u8, fast_gaussian_next_vertical_pass_wasm_u8,
+};
 use crate::{clamp_edge, reflect_101, EdgeMode, FastBlurChannels, ThreadingPolicy};
 use colorutils_rs::linear_to_planar::linear_to_plane;
 use colorutils_rs::planar_to_linear::plane_to_linear;
@@ -637,6 +641,15 @@ fn fast_gaussian_next_impl<
             }
         }
     }
+    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+    {
+        if std::any::type_name::<T>() == "u8" && BASE_RADIUS_I64_CUTOFF > radius {
+            _dispatcher_vertical =
+                fast_gaussian_next_vertical_pass_wasm_u8::<T, CHANNELS_COUNT, EDGE_MODE>;
+            _dispatcher_horizontal =
+                fast_gaussian_next_horizontal_pass_wasm_u8::<T, CHANNELS_COUNT, EDGE_MODE>;
+        }
+    }
     let thread_count =
         threading_policy.get_threads_count(width, height) as u32;
     if thread_count == 1 {
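Note on the two hunks above: both impls follow the crate's existing dispatch pattern, starting from a generic pass and swapping in a function pointer to a SIMD variant when the pixel type is `u8` and the radius is under the cutoff. A minimal sketch of that pattern (illustrative names only, not the crate's real signatures):

```rust
// Portable fallback, always available.
fn scalar_pass(data: &mut [u8], radius: u32) {
    let _ = (data, radius);
}

// SIMD fast path, compiled only for wasm32 with simd128 enabled.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
fn simd128_pass(data: &mut [u8], radius: u32) {
    let _ = (data, radius);
}

fn blur<T>(data: &mut [u8], radius: u32) {
    let mut dispatcher: fn(&mut [u8], u32) = scalar_pass;
    // Comparing type_name is how the generic impl detects the u8 specialization
    // at runtime, exactly as in the hunks above.
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    if std::any::type_name::<T>() == "u8" {
        dispatcher = simd128_pass;
    }
    dispatcher(data, radius);
}
```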
diff --git a/src/lib/lib.rs b/src/lib/lib.rs
index a8a482f..38687f5 100644
--- a/src/lib/lib.rs
+++ b/src/lib/lib.rs
@@ -48,6 +48,8 @@ mod threading_policy;
 mod to_storage;
 mod unsafe_slice;
 mod cpu_features;
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+mod wasm32;
 
 pub use channels_configuration::FastBlurChannels;
 pub use colorutils_rs::TransferFunction;
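The new module is doubly gated: it requires `target_arch = "wasm32"` and the `simd128` target feature, which the `.cargo/config.toml` entry above enables for `wasm32-unknown-unknown` builds. A tiny illustration of what the gate admits, using core `std::arch::wasm32` intrinsics (not code from the crate):

```rust
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
mod simd128_demo {
    use std::arch::wasm32::*;

    // Adds four i32 lanes at once; this only compiles when simd128 is on,
    // e.g. via `rustflags = ["-C", "target-feature=+simd128"]`.
    pub fn sum4(a: [i32; 4], b: [i32; 4]) -> [i32; 4] {
        let va = i32x4(a[0], a[1], a[2], a[3]);
        let vb = i32x4(b[0], b[1], b[2], b[3]);
        let vs = i32x4_add(va, vb);
        [
            i32x4_extract_lane::<0>(vs),
            i32x4_extract_lane::<1>(vs),
            i32x4_extract_lane::<2>(vs),
            i32x4_extract_lane::<3>(vs),
        ]
    }
}
```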
diff --git a/src/lib/neon/fast_gaussian.rs b/src/lib/neon/fast_gaussian.rs
index 21074d6..678fae2 100644
--- a/src/lib/neon/fast_gaussian.rs
+++ b/src/lib/neon/fast_gaussian.rs
@@ -44,68 +44,66 @@ pub fn fast_gaussian_horizontal_pass_neon_u8<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
-    let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
-    let initial_sum = ((radius * radius) >> 1) as i32;
-
-    let radius_64 = radius as i64;
-    let width_wide = width as i64;
-    let v_weight = unsafe { vdupq_n_f32((1f64 / (radius as f64 * radius as f64)) as f32) };
-    for y in start..std::cmp::min(height, end) {
-        let mut diffs: int32x4_t = unsafe { vdupq_n_s32(0) };
-        let mut summs: int32x4_t = unsafe { vdupq_n_s32(initial_sum) };
-
-        let current_y = ((y as i64) * (stride as i64)) as usize;
-
-        let start_x = 0 - 2 * radius_64;
-        for x in start_x..(width as i64) {
-            if x >= 0 {
-                let current_px = ((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;
-
-                let prepared_px_s32 =
-                    unsafe { vreinterpretq_u32_s32(vmulq_s32_f32(summs, v_weight)) };
-                let prepared_u16 = unsafe { vqmovn_u32(prepared_px_s32) };
-                let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };
-
-                let bytes_offset = current_y + current_px;
-                unsafe {
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
+        let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
+        let initial_sum = ((radius * radius) >> 1) as i32;
+
+        let radius_64 = radius as i64;
+        let width_wide = width as i64;
+        let v_weight = vdupq_n_f32((1f64 / (radius as f64 * radius as f64)) as f32);
+        for y in start..std::cmp::min(height, end) {
+            let mut diffs: int32x4_t = vdupq_n_s32(0);
+            let mut summs: int32x4_t = vdupq_n_s32(initial_sum);
+
+            let current_y = ((y as i64) * (stride as i64)) as usize;
+
+            let start_x = 0 - 2 * radius_64;
+            for x in start_x..(width as i64) {
+                if x >= 0 {
+                    let current_px =
+                        ((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;
+
+                    let prepared_px_s32 = vreinterpretq_u32_s32(vmulq_s32_f32(summs, v_weight));
+                    let prepared_u16 = vqmovn_u32(prepared_px_s32);
+                    let prepared_u8 = vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16));
+
+                    let bytes_offset = current_y + current_px;
                     let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
-                    store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
-                }
+                    store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);
 
-                let arr_index = ((x - radius_64) & 1023) as usize;
-                let d_arr_index = (x & 1023) as usize;
+                    let arr_index = ((x - radius_64) & 1023) as usize;
+                    let d_arr_index = (x & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let mut d_stored = unsafe { vld1q_s32(d_buf_ptr) };
-                d_stored = unsafe { vshlq_n_s32::<1>(d_stored) };
+                    let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                    let mut d_stored = vld1q_s32(d_buf_ptr);
+                    d_stored = vshlq_n_s32::<1>(d_stored);
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let a_stored = unsafe { vld1q_s32(buf_ptr) };
+                    let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                    let a_stored = vld1q_s32(buf_ptr);
 
-                diffs = unsafe { vaddq_s32(diffs, vsubq_s32(a_stored, d_stored)) };
-            } else if x + radius_64 >= 0 {
-                let arr_index = (x & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let mut stored = unsafe { vld1q_s32(buf_ptr) };
-                stored = unsafe { vshlq_n_s32::<1>(stored) };
-                diffs = unsafe { vsubq_s32(diffs, stored) };
-            }
+                    diffs = vaddq_s32(diffs, vsubq_s32(a_stored, d_stored));
+                } else if x + radius_64 >= 0 {
+                    let arr_index = (x & 1023) as usize;
+                    let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                    let mut stored = vld1q_s32(buf_ptr);
+                    stored = vshlq_n_s32::<1>(stored);
+                    diffs = vsubq_s32(diffs, stored);
+                }
 
-            let next_row_y = (y as usize) * (stride as usize);
-            let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
-            let next_row_px = next_row_x * CHANNELS_COUNT;
+                let next_row_y = (y as usize) * (stride as usize);
+                let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
+                let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8 };
-            let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8;
+                let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((x + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+                let arr_index = ((x + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { vaddq_s32(diffs, pixel_color) };
-            summs = unsafe { vaddq_s32(summs, diffs) };
-            unsafe {
+                diffs = vaddq_s32(diffs, pixel_color);
+                summs = vaddq_s32(summs, diffs);
                 vst1q_s32(buf_ptr, pixel_color);
             }
         }
@@ -125,70 +123,68 @@ pub(crate) fn fast_gaussian_vertical_pass_neon_u8<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
-    let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
-    let initial_sum = ((radius * radius) >> 1) as i32;
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
+        let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
+        let initial_sum = ((radius * radius) >> 1) as i32;
 
-    let height_wide = height as i64;
+        let height_wide = height as i64;
 
-    let radius_64 = radius as i64;
-    let v_weight = unsafe { vdupq_n_f32((1f64 / (radius as f64 * radius as f64)) as f32) };
-    for x in start..std::cmp::min(width, end) {
-        let mut diffs: int32x4_t = unsafe { vdupq_n_s32(0) };
-        let mut summs: int32x4_t = unsafe { vdupq_n_s32(initial_sum) };
+        let radius_64 = radius as i64;
+        let v_weight = vdupq_n_f32((1f64 / (radius as f64 * radius as f64)) as f32);
+        for x in start..std::cmp::min(width, end) {
+            let mut diffs: int32x4_t = vdupq_n_s32(0);
+            let mut summs: int32x4_t = vdupq_n_s32(initial_sum);
 
-        let start_y = 0 - 2 * radius as i64;
-        for y in start_y..height_wide {
-            let current_y = (y * (stride as i64)) as usize;
+            let start_y = 0 - 2 * radius as i64;
+            for y in start_y..height_wide {
+                let current_y = (y * (stride as i64)) as usize;
 
-            if y >= 0 {
-                let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
+                if y >= 0 {
+                    let current_px =
+                        ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
 
-                let prepared_px_s32 =
-                    unsafe { vreinterpretq_u32_s32(vmulq_s32_f32(summs, v_weight)) };
-                let prepared_u16 = unsafe { vqmovn_u32(prepared_px_s32) };
-                let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };
+                    let prepared_px_s32 = vreinterpretq_u32_s32(vmulq_s32_f32(summs, v_weight));
+                    let prepared_u16 = vqmovn_u32(prepared_px_s32);
+                    let prepared_u8 = vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16));
 
-                let bytes_offset = current_y + current_px;
+                    let bytes_offset = current_y + current_px;
 
-                unsafe {
                     let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
-                    store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
-                }
+                    store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);
 
-                let arr_index = ((y - radius_64) & 1023) as usize;
-                let d_arr_index = (y & 1023) as usize;
+                    let arr_index = ((y - radius_64) & 1023) as usize;
+                    let d_arr_index = (y & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let mut d_stored = unsafe { vld1q_s32(d_buf_ptr) };
-                d_stored = unsafe { vshlq_n_s32::<1>(d_stored) };
+                    let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                    let mut d_stored = vld1q_s32(d_buf_ptr);
+                    d_stored = vshlq_n_s32::<1>(d_stored);
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let a_stored = unsafe { vld1q_s32(buf_ptr) };
+                    let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                    let a_stored = vld1q_s32(buf_ptr);
 
-                diffs = unsafe { vaddq_s32(diffs, vsubq_s32(a_stored, d_stored)) };
-            } else if y + radius_64 >= 0 {
-                let arr_index = (y & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let mut stored = unsafe { vld1q_s32(buf_ptr) };
-                stored = unsafe { vshlq_n_s32::<1>(stored) };
-                diffs = unsafe { vsubq_s32(diffs, stored) };
-            }
+                    diffs = vaddq_s32(diffs, vsubq_s32(a_stored, d_stored));
+                } else if y + radius_64 >= 0 {
+                    let arr_index = (y & 1023) as usize;
+                    let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                    let mut stored = vld1q_s32(buf_ptr);
+                    stored = vshlq_n_s32::<1>(stored);
+                    diffs = vsubq_s32(diffs, stored);
+                }
+
+                let next_row_y =
+                    clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
+                let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
 
-            let next_row_y =
-                clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
-            let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8;
+                let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8 };
-            let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
+                let arr_index = ((y + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            let arr_index = ((y + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+                diffs = vaddq_s32(diffs, pixel_color);
+                summs = vaddq_s32(summs, diffs);
 
-            diffs = unsafe { vaddq_s32(diffs, pixel_color) };
-            summs = unsafe { vaddq_s32(summs, diffs) };
-            unsafe {
                 vst1q_s32(buf_ptr, pixel_color);
             }
         }
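Both passes above are the same second-order running-sum filter, applied along rows or columns. A scalar, single-plane sketch of the idea with the SIMD and multi-channel handling stripped away (illustrative only, not the crate's code):

```rust
// Scalar equivalent of one horizontal pass; the NEON code does the same
// arithmetic on four i32 lanes (one per channel) at a time.
fn fast_gaussian_row(row: &mut [u8], radius: u32) {
    let r = radius as i64;
    let w = row.len() as i64;
    let weight = 1.0f32 / (radius as f32 * radius as f32);
    let mut buf = [0i32; 1024]; // ring buffer of incoming samples
    let mut dif = 0i32; // first integral of the sliding box kernel
    let mut sum = ((radius * radius) >> 1) as i32; // second integral, pre-biased for rounding
    for x in -2 * r..w {
        if x >= 0 {
            row[x as usize] = (sum as f32 * weight) as u8;
            // samples leaving the window: +buf[x - r], -2 * buf[x]
            dif += buf[((x - r) & 1023) as usize] - 2 * buf[(x & 1023) as usize];
        } else if x + r >= 0 {
            dif -= 2 * buf[(x & 1023) as usize];
        }
        // sample entering the window, clamped at the edges
        let px = row[(x + r).clamp(0, w - 1) as usize] as i32;
        dif += px;
        sum += dif;
        buf[((x + r) & 1023) as usize] = px;
    }
}
```

The `& 1023` indexing keeps the window inside the 1024-slot ring, which the `BASE_RADIUS_I64_CUTOFF` guard in the dispatchers comfortably satisfies.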
diff --git a/src/lib/neon/fast_gaussian_f16.rs b/src/lib/neon/fast_gaussian_f16.rs
index a1c304c..0c2ef13 100644
--- a/src/lib/neon/fast_gaussian_f16.rs
+++ b/src/lib/neon/fast_gaussian_f16.rs
@@ -46,65 +46,63 @@ pub fn fast_gaussian_vertical_pass_neon_f16<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f16> = unsafe { std::mem::transmute(undef_bytes) };
-    let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, f16> = std::mem::transmute(undef_bytes);
+        let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
 
-    let height_wide = height as i64;
+        let height_wide = height as i64;
 
-    let radius_64 = radius as i64;
-    let weight = 1.0f32 / ((radius as f32) * (radius as f32));
-    let f_weight = unsafe { vdupq_n_f32(weight) };
-    for x in start..std::cmp::min(width, end) {
-        let mut diffs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-        let mut summs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
+        let radius_64 = radius as i64;
+        let weight = 1.0f32 / ((radius as f32) * (radius as f32));
+        let f_weight = vdupq_n_f32(weight);
+        for x in start..std::cmp::min(width, end) {
+            let mut diffs: float32x4_t = vdupq_n_f32(0f32);
+            let mut summs: float32x4_t = vdupq_n_f32(0f32);
 
-        let start_y = 0 - 2 * radius as i64;
-        for y in start_y..height_wide {
-            let current_y = (y * (stride as i64)) as usize;
+            let start_y = 0 - 2 * radius as i64;
+            for y in start_y..height_wide {
+                let current_y = (y * (stride as i64)) as usize;
 
-            if y >= 0 {
-                let current_px = (std::cmp::max(x, 0)) as usize * CHANNELS_COUNT;
+                if y >= 0 {
+                    let current_px = std::cmp::max(x, 0) as usize * CHANNELS_COUNT;
 
-                let prepared_px = unsafe { vmulq_f32(summs, f_weight) };
+                    let prepared_px = vmulq_f32(summs, f_weight);
 
-                unsafe {
                     let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f16;
                     store_f32_f16::<CHANNELS_COUNT>(dst_ptr, prepared_px);
-                }
 
-                let arr_index = ((y - radius_64) & 1023) as usize;
-                let d_arr_index = (y & 1023) as usize;
+                    let arr_index = ((y - radius_64) & 1023) as usize;
+                    let d_arr_index = (y & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *mut f32 };
-                let mut d_stored = unsafe { vld1q_f32(d_buf_ptr) };
-                d_stored = unsafe { vmulq_n_f32(d_stored, 2f32) };
+                    let d_buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *mut f32;
+                    let mut d_stored = vld1q_f32(d_buf_ptr);
+                    d_stored = vmulq_n_f32(d_stored, 2f32);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let a_stored = unsafe { vld1q_f32(buf_ptr) };
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let a_stored = vld1q_f32(buf_ptr);
 
-                diffs = unsafe { vaddq_f32(diffs, vsubq_f32(a_stored, d_stored)) };
-            } else if y + radius_64 >= 0 {
-                let arr_index = (y & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let mut stored = unsafe { vld1q_f32(buf_ptr) };
-                stored = unsafe { vmulq_n_f32(stored, 2f32) };
-                diffs = unsafe { vsubq_f32(diffs, stored) };
-            }
+                    diffs = vaddq_f32(diffs, vsubq_f32(a_stored, d_stored));
+                } else if y + radius_64 >= 0 {
+                    let arr_index = (y & 1023) as usize;
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let mut stored = vld1q_f32(buf_ptr);
+                    stored = vmulq_n_f32(stored, 2f32);
+                    diffs = vsubq_f32(diffs, stored);
+                }
 
-            let next_row_y =
-                clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
-            let next_row_x = x as usize * CHANNELS_COUNT;
+                let next_row_y =
+                    clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
+                let next_row_x = x as usize * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f16 };
-            let pixel_color = unsafe { load_f32_f16::<CHANNELS_COUNT>(s_ptr) };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f16;
+                let pixel_color = load_f32_f16::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((y + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
+                let arr_index = ((y + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
 
-            diffs = unsafe { vaddq_f32(diffs, pixel_color) };
-            summs = unsafe { vaddq_f32(summs, diffs) };
-            unsafe {
+                diffs = vaddq_f32(diffs, pixel_color);
+                summs = vaddq_f32(summs, diffs);
                 vst1q_f32(buf_ptr, pixel_color);
             }
         }
@@ -124,63 +122,61 @@ pub fn fast_gaussian_horizontal_pass_neon_f16<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f16> = unsafe { std::mem::transmute(undef_bytes) };
-    let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
-    let radius_64 = radius as i64;
-    let width_wide = width as i64;
-    let weight = 1.0f32 / ((radius as f32) * (radius as f32));
-    let f_weight = unsafe { vdupq_n_f32(weight) };
-    for y in start..std::cmp::min(height, end) {
-        let mut diffs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-        let mut summs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-
-        let current_y = ((y as i64) * (stride as i64)) as usize;
-
-        let start_x = 0 - 2 * radius_64;
-        for x in start_x..(width as i64) {
-            if x >= 0 {
-                let current_px = (std::cmp::max(x, 0) as u32) as usize * CHANNELS_COUNT;
-
-                let prepared_px = unsafe { vmulq_f32(summs, f_weight) };
-
-                unsafe {
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, f16> = std::mem::transmute(undef_bytes);
+        let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
+        let radius_64 = radius as i64;
+        let width_wide = width as i64;
+        let weight = 1.0f32 / ((radius as f32) * (radius as f32));
+        let f_weight = vdupq_n_f32(weight);
+        for y in start..std::cmp::min(height, end) {
+            let mut diffs: float32x4_t = vdupq_n_f32(0f32);
+            let mut summs: float32x4_t = vdupq_n_f32(0f32);
+
+            let current_y = ((y as i64) * (stride as i64)) as usize;
+
+            let start_x = 0 - 2 * radius_64;
+            for x in start_x..(width as i64) {
+                if x >= 0 {
+                    let current_px = (std::cmp::max(x, 0) as u32) as usize * CHANNELS_COUNT;
+
+                    let prepared_px = vmulq_f32(summs, f_weight);
+
                     let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f16;
                     store_f32_f16::<CHANNELS_COUNT>(dst_ptr, prepared_px);
-                }
 
-                let arr_index = ((x - radius_64) & 1023) as usize;
-                let d_arr_index = (x & 1023) as usize;
+                    let arr_index = ((x - radius_64) & 1023) as usize;
+                    let d_arr_index = (x & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *mut f32 };
-                let mut d_stored = unsafe { vld1q_f32(d_buf_ptr) };
-                d_stored = unsafe { vmulq_n_f32(d_stored, 2f32) };
+                    let d_buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *mut f32;
+                    let mut d_stored = vld1q_f32(d_buf_ptr);
+                    d_stored = vmulq_n_f32(d_stored, 2f32);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let a_stored = unsafe { vld1q_f32(buf_ptr) };
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let a_stored = vld1q_f32(buf_ptr);
 
-                diffs = unsafe { vaddq_f32(diffs, vsubq_f32(a_stored, d_stored)) };
-            } else if x + radius_64 >= 0 {
-                let arr_index = (x & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let mut stored = unsafe { vld1q_f32(buf_ptr) };
-                stored = unsafe { vmulq_n_f32(stored, 2f32) };
-                diffs = unsafe { vsubq_f32(diffs, stored) };
-            }
+                    diffs = vaddq_f32(diffs, vsubq_f32(a_stored, d_stored));
+                } else if x + radius_64 >= 0 {
+                    let arr_index = (x & 1023) as usize;
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let mut stored = vld1q_f32(buf_ptr);
+                    stored = vmulq_n_f32(stored, 2f32);
+                    diffs = vsubq_f32(diffs, stored);
+                }
 
-            let next_row_y = (y as usize) * (stride as usize);
-            let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
-            let next_row_px = next_row_x * CHANNELS_COUNT;
+                let next_row_y = (y as usize) * (stride as usize);
+                let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
+                let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f16 };
-            let pixel_color = unsafe { load_f32_f16::<CHANNELS_COUNT>(s_ptr) };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f16;
+                let pixel_color = load_f32_f16::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((x + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
+                let arr_index = ((x + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
 
-            diffs = unsafe { vaddq_f32(diffs, pixel_color) };
-            summs = unsafe { vaddq_f32(summs, diffs) };
-            unsafe {
+                diffs = vaddq_f32(diffs, pixel_color);
+                summs = vaddq_f32(summs, diffs);
                 vst1q_f32(buf_ptr, pixel_color);
             }
         }
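The f16 passes widen half-precision lanes to f32 before any arithmetic via `load_f32_f16`/`store_f32_f16`. Scalar stand-ins for those helpers, assuming the `half` crate's `f16` (the crate's NEON versions convert four lanes at once):

```rust
use half::f16;

// Hypothetical scalar equivalents of the helpers above, for illustration only.
fn load_f32_f16_scalar(src: &[f16; 4]) -> [f32; 4] {
    [src[0].to_f32(), src[1].to_f32(), src[2].to_f32(), src[3].to_f32()]
}

fn store_f32_f16_scalar(dst: &mut [f16; 4], v: [f32; 4]) {
    for (d, s) in dst.iter_mut().zip(v) {
        *d = f16::from_f32(s);
    }
}
```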
diff --git a/src/lib/neon/fast_gaussian_f32.rs b/src/lib/neon/fast_gaussian_f32.rs
index c79d96c..0c0935b 100644
--- a/src/lib/neon/fast_gaussian_f32.rs
+++ b/src/lib/neon/fast_gaussian_f32.rs
@@ -44,65 +44,63 @@ pub fn fast_gaussian_vertical_pass_neon_f32<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undef_bytes) };
-    let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, f32> = std::mem::transmute(undef_bytes);
+        let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
 
-    let height_wide = height as i64;
+        let height_wide = height as i64;
 
-    let radius_64 = radius as i64;
-    let weight = 1.0f32 / ((radius as f32) * (radius as f32));
-    let f_weight = unsafe { vdupq_n_f32(weight) };
-    for x in start..std::cmp::min(width, end) {
-        let mut diffs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-        let mut summs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
+        let radius_64 = radius as i64;
+        let weight = 1.0f32 / ((radius as f32) * (radius as f32));
+        let f_weight = vdupq_n_f32(weight);
+        for x in start..std::cmp::min(width, end) {
+            let mut diffs: float32x4_t = vdupq_n_f32(0f32);
+            let mut summs: float32x4_t = vdupq_n_f32(0f32);
 
-        let start_y = 0 - 2 * radius as i64;
-        for y in start_y..height_wide {
-            let current_y = (y * (stride as i64)) as usize;
+            let start_y = 0 - 2 * radius as i64;
+            for y in start_y..height_wide {
+                let current_y = (y * (stride as i64)) as usize;
 
-            if y >= 0 {
-                let current_px = (std::cmp::max(x, 0)) as usize * CHANNELS_COUNT;
+                if y >= 0 {
+                    let current_px = std::cmp::max(x, 0) as usize * CHANNELS_COUNT;
 
-                let prepared_px = unsafe { vmulq_f32(summs, f_weight) };
+                    let prepared_px = vmulq_f32(summs, f_weight);
 
-                unsafe {
                     let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f32;
                     store_f32::<CHANNELS_COUNT>(dst_ptr, prepared_px);
-                }
 
-                let arr_index = ((y - radius_64) & 1023) as usize;
-                let d_arr_index = (y & 1023) as usize;
+                    let arr_index = ((y - radius_64) & 1023) as usize;
+                    let d_arr_index = (y & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *mut f32 };
-                let mut d_stored = unsafe { vld1q_f32(d_buf_ptr) };
-                d_stored = unsafe { vmulq_n_f32(d_stored, 2f32) };
+                    let d_buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *mut f32;
+                    let mut d_stored = vld1q_f32(d_buf_ptr);
+                    d_stored = vmulq_n_f32(d_stored, 2f32);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let a_stored = unsafe { vld1q_f32(buf_ptr) };
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let a_stored = vld1q_f32(buf_ptr);
 
-                diffs = unsafe { vaddq_f32(diffs, vsubq_f32(a_stored, d_stored)) };
-            } else if y + radius_64 >= 0 {
-                let arr_index = (y & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let mut stored = unsafe { vld1q_f32(buf_ptr) };
-                stored = unsafe { vmulq_n_f32(stored, 2f32) };
-                diffs = unsafe { vsubq_f32(diffs, stored) };
-            }
+                    diffs = vaddq_f32(diffs, vsubq_f32(a_stored, d_stored));
+                } else if y + radius_64 >= 0 {
+                    let arr_index = (y & 1023) as usize;
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let mut stored = vld1q_f32(buf_ptr);
+                    stored = vmulq_n_f32(stored, 2f32);
+                    diffs = vsubq_f32(diffs, stored);
+                }
 
-            let next_row_y =
-                clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
-            let next_row_x = x as usize * CHANNELS_COUNT;
+                let next_row_y =
+                    clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
+                let next_row_x = x as usize * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f32 };
-            let pixel_color = unsafe { load_f32_fast::<CHANNELS_COUNT>(s_ptr) };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f32;
+                let pixel_color = load_f32_fast::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((y + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
+                let arr_index = ((y + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
 
-            diffs = unsafe { vaddq_f32(diffs, pixel_color) };
-            summs = unsafe { vaddq_f32(summs, diffs) };
-            unsafe {
+                diffs = vaddq_f32(diffs, pixel_color);
+                summs = vaddq_f32(summs, diffs);
                 vst1q_f32(buf_ptr, pixel_color);
             }
         }
@@ -122,63 +120,61 @@ pub fn fast_gaussian_horizontal_pass_neon_f32<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undef_bytes) };
-    let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
-    let radius_64 = radius as i64;
-    let width_wide = width as i64;
-    let weight = 1.0f32 / ((radius as f32) * (radius as f32));
-    let f_weight = unsafe { vdupq_n_f32(weight) };
-    for y in start..std::cmp::min(height, end) {
-        let mut diffs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-        let mut summs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-
-        let current_y = ((y as i64) * (stride as i64)) as usize;
-
-        let start_x = 0 - 2 * radius_64;
-        for x in start_x..(width as i64) {
-            if x >= 0 {
-                let current_px = (std::cmp::max(x, 0) as u32) as usize * CHANNELS_COUNT;
-
-                let prepared_px = unsafe { vmulq_f32(summs, f_weight) };
-
-                unsafe {
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, f32> = std::mem::transmute(undef_bytes);
+        let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
+        let radius_64 = radius as i64;
+        let width_wide = width as i64;
+        let weight = 1.0f32 / ((radius as f32) * (radius as f32));
+        let f_weight = vdupq_n_f32(weight);
+        for y in start..std::cmp::min(height, end) {
+            let mut diffs: float32x4_t = vdupq_n_f32(0f32);
+            let mut summs: float32x4_t = vdupq_n_f32(0f32);
+
+            let current_y = ((y as i64) * (stride as i64)) as usize;
+
+            let start_x = 0 - 2 * radius_64;
+            for x in start_x..(width as i64) {
+                if x >= 0 {
+                    let current_px = (std::cmp::max(x, 0) as u32) as usize * CHANNELS_COUNT;
+
+                    let prepared_px = vmulq_f32(summs, f_weight);
+
                     let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f32;
                     store_f32::<CHANNELS_COUNT>(dst_ptr, prepared_px);
-                }
 
-                let arr_index = ((x - radius_64) & 1023) as usize;
-                let d_arr_index = (x & 1023) as usize;
+                    let arr_index = ((x - radius_64) & 1023) as usize;
+                    let d_arr_index = (x & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *mut f32 };
-                let mut d_stored = unsafe { vld1q_f32(d_buf_ptr) };
-                d_stored = unsafe { vmulq_n_f32(d_stored, 2f32) };
+                    let d_buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *mut f32;
+                    let mut d_stored = vld1q_f32(d_buf_ptr);
+                    d_stored = vmulq_n_f32(d_stored, 2f32);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let a_stored = unsafe { vld1q_f32(buf_ptr) };
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let a_stored = vld1q_f32(buf_ptr);
 
-                diffs = unsafe { vaddq_f32(diffs, vsubq_f32(a_stored, d_stored)) };
-            } else if x + radius_64 >= 0 {
-                let arr_index = (x & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let mut stored = unsafe { vld1q_f32(buf_ptr) };
-                stored = unsafe { vmulq_n_f32(stored, 2f32) };
-                diffs = unsafe { vsubq_f32(diffs, stored) };
-            }
+                    diffs = vaddq_f32(diffs, vsubq_f32(a_stored, d_stored));
+                } else if x + radius_64 >= 0 {
+                    let arr_index = (x & 1023) as usize;
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let mut stored = vld1q_f32(buf_ptr);
+                    stored = vmulq_n_f32(stored, 2f32);
+                    diffs = vsubq_f32(diffs, stored);
+                }
 
-            let next_row_y = (y as usize) * (stride as usize);
-            let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
-            let next_row_px = next_row_x * CHANNELS_COUNT;
+                let next_row_y = (y as usize) * (stride as usize);
+                let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
+                let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32 };
-            let pixel_color = unsafe { load_f32_fast::<CHANNELS_COUNT>(s_ptr) };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32;
+                let pixel_color = load_f32_fast::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((x + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
+                let arr_index = ((x + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
 
-            diffs = unsafe { vaddq_f32(diffs, pixel_color) };
-            summs = unsafe { vaddq_f32(summs, diffs) };
-            unsafe {
+                diffs = vaddq_f32(diffs, pixel_color);
+                summs = vaddq_f32(summs, diffs);
                 vst1q_f32(buf_ptr, pixel_color);
             }
         }
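The `fast_gaussian_next` kernels in the next file are the third-order variant of the same scheme: one more integration stage (`ders`) between `diffs` and `summs`, giving an effective kernel closer to a true gaussian. A scalar, single-plane sketch (illustrative only, not the crate's code):

```rust
// Triple-integrated box filter; mirrors the diffs/ders/summs update order of
// the NEON passes below.
fn fast_gaussian_next_row(row: &mut [u8], radius: u32) {
    let r = radius as i64;
    let w = row.len() as i64;
    let weight = 1.0f32 / (radius as f32).powi(3);
    let mut buf = [0i32; 1024];
    let (mut dif, mut der, mut sum) = (0i32, 0i32, 0i32);
    for x in -3 * r..w {
        if x >= 0 {
            row[x as usize] = (sum as f32 * weight).round() as u8;
            dif += 3 * (buf[(x & 1023) as usize] - buf[((x + r) & 1023) as usize])
                - buf[((x - r) & 1023) as usize];
        } else if x + r >= 0 {
            dif += 3 * (buf[(x & 1023) as usize] - buf[((x + r) & 1023) as usize]);
        } else if x + 2 * r >= 0 {
            dif -= 3 * buf[((x + r) & 1023) as usize];
        }
        // the incoming sample sits 3r/2 ahead of the write cursor
        let px = row[(x + 3 * r / 2).clamp(0, w - 1) as usize] as i32;
        buf[((x + 2 * r) & 1023) as usize] = px;
        dif += px;
        der += dif;
        sum += der;
    }
}
```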
diff --git a/src/lib/neon/fast_gaussian_next.rs b/src/lib/neon/fast_gaussian_next.rs
index 509035f..a621698 100644
--- a/src/lib/neon/fast_gaussian_next.rs
+++ b/src/lib/neon/fast_gaussian_next.rs
@@ -25,7 +25,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-use crate::neon::{load_u8_s32_fast, store_u8x8_m4};
+use crate::neon::{load_u8_s32_fast, store_u8x8_m4, vmulq_by_3_s32};
 use crate::reflect_index;
 use crate::{clamp_edge, reflect_101, EdgeMode};
 use std::arch::aarch64::*;
@@ -45,89 +45,86 @@ pub fn fast_gaussian_next_vertical_pass_neon_u8<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
-    let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
+        let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
 
-    let height_wide = height as i64;
+        let height_wide = height as i64;
 
-    let radius_64 = radius as i64;
-    let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let f_weight = unsafe { vdupq_n_f32(weight) };
-    for x in start..std::cmp::min(width, end) {
-        let mut diffs: int32x4_t = unsafe { vdupq_n_s32(0) };
-        let mut ders: int32x4_t = unsafe { vdupq_n_s32(0) };
-        let mut summs: int32x4_t = unsafe { vdupq_n_s32(0) };
+        let radius_64 = radius as i64;
+        let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
+        let f_weight = vdupq_n_f32(weight);
+        for x in start..std::cmp::min(width, end) {
+            let mut diffs: int32x4_t = vdupq_n_s32(0);
+            let mut ders: int32x4_t = vdupq_n_s32(0);
+            let mut summs: int32x4_t = vdupq_n_s32(0);
 
-        let start_y = 0 - 3 * radius as i64;
-        for y in start_y..height_wide {
-            let current_y = (y * (stride as i64)) as usize;
+            let start_y = 0 - 3 * radius as i64;
+            for y in start_y..height_wide {
+                let current_y = (y * (stride as i64)) as usize;
 
-            if y >= 0 {
-                let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
+                if y >= 0 {
+                    let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
 
-                let prepared_px_s32 =
-                    unsafe { vcvtaq_s32_f32(vmulq_f32(vcvtq_f32_s32(summs), f_weight)) };
-                let prepared_u16 = unsafe { vqmovun_s32(prepared_px_s32) };
-                let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };
+                    let prepared_px_s32 = vcvtaq_s32_f32(vmulq_f32(vcvtq_f32_s32(summs), f_weight));
+                    let prepared_u16 = vqmovun_s32(prepared_px_s32);
+                    let prepared_u8 = vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16));
 
-                let bytes_offset = current_y + current_px;
+                    let bytes_offset = current_y + current_px;
 
-                unsafe {
                     let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
-                    store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
-                }
+                    store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);
 
-                let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
-                let d_arr_index_2 = ((y - radius_64) & 1023) as usize;
-                let d_arr_index = (y & 1023) as usize;
+                    let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
+                    let d_arr_index_2 = ((y - radius_64) & 1023) as usize;
+                    let d_arr_index = (y & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *const i32 };
-                let stored = unsafe { vld1q_s32(buf_ptr) };
+                    let buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *const i32;
+                    let stored = vld1q_s32(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(d_arr_index_1) as *const i32 };
-                let stored_1 = unsafe { vld1q_s32(buf_ptr_1) };
+                    let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1) as *const i32;
+                    let stored_1 = vld1q_s32(buf_ptr_1);
 
-                let buf_ptr_2 = unsafe { buffer.as_mut_ptr().add(d_arr_index_2) as *const i32 };
-                let stored_2 = unsafe { vld1q_s32(buf_ptr_2) };
+                    let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2) as *const i32;
+                    let stored_2 = vld1q_s32(buf_ptr_2);
 
-                let new_diff =
-                    unsafe { vsubq_s32(vmulq_n_s32(vsubq_s32(stored, stored_1), 3), stored_2) };
-                diffs = unsafe { vaddq_s32(diffs, new_diff) };
-            } else if y + radius_64 >= 0 {
-                let arr_index = (y & 1023) as usize;
-                let arr_index_1 = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { vld1q_s32(buf_ptr) };
+                    let new_diff = vsubq_s32(vmulq_by_3_s32(vsubq_s32(stored, stored_1)), stored_2);
+                    diffs = vaddq_s32(diffs, new_diff);
+                } else if y + radius_64 >= 0 {
+                    let arr_index = (y & 1023) as usize;
+                    let arr_index_1 = ((y + radius_64) & 1023) as usize;
+                    let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                    let stored = vld1q_s32(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { vld1q_s32(buf_ptr_1) };
+                    let buf_ptr_1 = buffer.get_unchecked_mut(arr_index_1).as_mut_ptr();
+                    let stored_1 = vld1q_s32(buf_ptr_1);
 
-                let new_diff = unsafe { vmulq_n_s32(vsubq_s32(stored, stored_1), 3) };
+                    let new_diff = vmulq_by_3_s32(vsubq_s32(stored, stored_1));
 
-                diffs = unsafe { vaddq_s32(diffs, new_diff) };
-            } else if y + 2 * radius_64 >= 0 {
-                let arr_index = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { vld1q_s32(buf_ptr) };
-                diffs = unsafe { vsubq_s32(diffs, vmulq_n_s32(stored, 3)) };
-            }
+                    diffs = vaddq_s32(diffs, new_diff);
+                } else if y + 2 * radius_64 >= 0 {
+                    let arr_index = ((y + radius_64) & 1023) as usize;
+                    let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                    let stored = vld1q_s32(buf_ptr);
+                    diffs = vsubq_s32(diffs, vmulq_by_3_s32(stored));
+                }
 
-            let next_row_y = clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
-                * (stride as usize);
-            let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
+                let next_row_y =
+                    clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
+                        * (stride as usize);
+                let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8 };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8;
 
-            let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
+                let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((y + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+                let arr_index = ((y + 2 * radius_64) & 1023) as usize;
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { vaddq_s32(diffs, pixel_color) };
-            ders = unsafe { vaddq_s32(ders, diffs) };
-            summs = unsafe { vaddq_s32(summs, ders) };
-            unsafe {
+                diffs = vaddq_s32(diffs, pixel_color);
+                ders = vaddq_s32(ders, diffs);
+                summs = vaddq_s32(summs, ders);
                 vst1q_s32(buf_ptr, pixel_color);
             }
         }
@@ -147,88 +144,84 @@ pub(crate) fn fast_gaussian_next_horizontal_pass_neon_u8<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
-    let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
+        let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
 
-    let width_wide = width as i64;
+        let width_wide = width as i64;
 
-    let radius_64 = radius as i64;
-    let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let f_weight = unsafe { vdupq_n_f32(weight) };
-    for y in start..std::cmp::min(height, end) {
-        let mut diffs: int32x4_t = unsafe { vdupq_n_s32(0) };
-        let mut ders: int32x4_t = unsafe { vdupq_n_s32(0) };
-        let mut summs: int32x4_t = unsafe { vdupq_n_s32(0) };
+        let radius_64 = radius as i64;
+        let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
+        let f_weight = vdupq_n_f32(weight);
+        for y in start..std::cmp::min(height, end) {
+            let mut diffs: int32x4_t = vdupq_n_s32(0);
+            let mut ders: int32x4_t = vdupq_n_s32(0);
+            let mut summs: int32x4_t = vdupq_n_s32(0);
 
-        let current_y = ((y as i64) * (stride as i64)) as usize;
+            let current_y = ((y as i64) * (stride as i64)) as usize;
 
-        for x in (0 - 3 * radius_64)..(width as i64) {
-            if x >= 0 {
-                let current_px = x as usize * CHANNELS_COUNT;
+            for x in (0 - 3 * radius_64)..(width as i64) {
+                if x >= 0 {
+                    let current_px = x as usize * CHANNELS_COUNT;
 
-                let prepared_px_s32 =
-                    unsafe { vcvtaq_s32_f32(vmulq_f32(vcvtq_f32_s32(summs), f_weight)) };
-                let prepared_u16 = unsafe { vqmovun_s32(prepared_px_s32) };
-                let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };
+                    let prepared_px_s32 = vcvtaq_s32_f32(vmulq_f32(vcvtq_f32_s32(summs), f_weight));
+                    let prepared_u16 = vqmovun_s32(prepared_px_s32);
+                    let prepared_u8 = vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16));
 
-                let bytes_offset = current_y + current_px;
+                    let bytes_offset = current_y + current_px;
 
-                unsafe {
                     let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
-                    store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
-                }
+                    store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);
 
-                let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
-                let d_arr_index_2 = ((x - radius_64) & 1023) as usize;
-                let d_arr_index = (x & 1023) as usize;
+                    let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
+                    let d_arr_index_2 = ((x - radius_64) & 1023) as usize;
+                    let d_arr_index = (x & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let stored = unsafe { vld1q_s32(buf_ptr) };
+                    let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                    let stored = vld1q_s32(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { vld1q_s32(buf_ptr_1) };
+                    let buf_ptr_1 = buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr();
+                    let stored_1 = vld1q_s32(buf_ptr_1);
 
-                let buf_ptr_2 = unsafe { buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr() };
-                let stored_2 = unsafe { vld1q_s32(buf_ptr_2) };
+                    let buf_ptr_2 = buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr();
+                    let stored_2 = vld1q_s32(buf_ptr_2);
 
-                let new_diff =
-                    unsafe { vsubq_s32(vmulq_n_s32(vsubq_s32(stored, stored_1), 3), stored_2) };
-                diffs = unsafe { vaddq_s32(diffs, new_diff) };
-            } else if x + radius_64 >= 0 {
-                let arr_index = (x & 1023) as usize;
-                let arr_index_1 = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { vld1q_s32(buf_ptr) };
+                    let new_diff = vsubq_s32(vmulq_by_3_s32(vsubq_s32(stored, stored_1)), stored_2);
+                    diffs = vaddq_s32(diffs, new_diff);
+                } else if x + radius_64 >= 0 {
+                    let arr_index = (x & 1023) as usize;
+                    let arr_index_1 = ((x + radius_64) & 1023) as usize;
+                    let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                    let stored = vld1q_s32(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { vld1q_s32(buf_ptr_1) };
+                    let buf_ptr_1 = buffer.get_unchecked_mut(arr_index_1).as_mut_ptr();
+                    let stored_1 = vld1q_s32(buf_ptr_1);
 
-                let new_diff = unsafe { vmulq_n_s32(vsubq_s32(stored, stored_1), 3) };
+                    let new_diff = vmulq_by_3_s32(vsubq_s32(stored, stored_1));
 
-                diffs = unsafe { vaddq_s32(diffs, new_diff) };
-            } else if x + 2 * radius_64 >= 0 {
-                let arr_index = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { vld1q_s32(buf_ptr) };
-                diffs = unsafe { vsubq_s32(diffs, vmulq_n_s32(stored, 3)) };
-            }
+                    diffs = vaddq_s32(diffs, new_diff);
+                } else if x + 2 * radius_64 >= 0 {
+                    let arr_index = ((x + radius_64) & 1023) as usize;
+                    let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                    let stored = vld1q_s32(buf_ptr);
+                    diffs = vsubq_s32(diffs, vmulq_by_3_s32(stored));
+                }
 
-            let next_row_y = (y as usize) * (stride as usize);
-            let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1);
-            let next_row_px = next_row_x * CHANNELS_COUNT;
+                let next_row_y = (y as usize) * (stride as usize);
+                let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1);
+                let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8 };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8;
 
-            let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
+                let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((x + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+                let arr_index = ((x + 2 * radius_64) & 1023) as usize;
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { vaddq_s32(diffs, pixel_color) };
-            ders = unsafe { vaddq_s32(ders, diffs) };
-            summs = unsafe { vaddq_s32(summs, ders) };
-            unsafe {
+                diffs = vaddq_s32(diffs, pixel_color);
+                ders = vaddq_s32(ders, diffs);
+                summs = vaddq_s32(summs, ders);
                 vst1q_s32(buf_ptr, pixel_color);
            }
        }
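This file also swaps `vmulq_n_s32(v, 3)` for a new `vmulq_by_3_s32` helper; its body is not part of this patch. A plausible shift-and-add definition (a guess, not the crate's verbatim code):

```rust
#[cfg(target_arch = "aarch64")]
#[inline(always)]
pub(crate) unsafe fn vmulq_by_3_s32(
    v: std::arch::aarch64::int32x4_t,
) -> std::arch::aarch64::int32x4_t {
    use std::arch::aarch64::{vaddq_s32, vshlq_n_s32};
    // 3 * v computed as (v << 1) + v, avoiding the by-scalar multiply
    vaddq_s32(vshlq_n_s32::<1>(v), v)
}
```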
buffer.get_unchecked_mut(arr_index_1).as_mut_ptr() }; - let stored_1 = unsafe { vld1q_s32(buf_ptr_1) }; + let buf_ptr_1 = buffer.get_unchecked_mut(arr_index_1).as_mut_ptr(); + let stored_1 = vld1q_s32(buf_ptr_1); - let new_diff = unsafe { vmulq_n_s32(vsubq_s32(stored, stored_1), 3) }; + let new_diff = vmulq_by_3_s32(vsubq_s32(stored, stored_1)); - diffs = unsafe { vaddq_s32(diffs, new_diff) }; - } else if x + 2 * radius_64 >= 0 { - let arr_index = ((x + radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() }; - let stored = unsafe { vld1q_s32(buf_ptr) }; - diffs = unsafe { vsubq_s32(diffs, vmulq_n_s32(stored, 3)) }; - } + diffs = vaddq_s32(diffs, new_diff); + } else if x + 2 * radius_64 >= 0 { + let arr_index = ((x + radius_64) & 1023) as usize; + let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr(); + let stored = vld1q_s32(buf_ptr); + diffs = vsubq_s32(diffs, vmulq_by_3_s32(stored)); + } - let next_row_y = (y as usize) * (stride as usize); - let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1); - let next_row_px = next_row_x * CHANNELS_COUNT; + let next_row_y = (y as usize) * (stride as usize); + let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1); + let next_row_px = next_row_x * CHANNELS_COUNT; - let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8 }; + let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8; - let pixel_color = unsafe { load_u8_s32_fast::(s_ptr) }; + let pixel_color = load_u8_s32_fast::(s_ptr); - let arr_index = ((x + 2 * radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() }; + let arr_index = ((x + 2 * radius_64) & 1023) as usize; + let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr(); - diffs = unsafe { vaddq_s32(diffs, pixel_color) }; - ders = unsafe { vaddq_s32(ders, diffs) }; - summs = unsafe { vaddq_s32(summs, ders) }; - unsafe { + diffs = vaddq_s32(diffs, pixel_color); + ders = vaddq_s32(ders, diffs); + summs = vaddq_s32(summs, ders); vst1q_s32(buf_ptr, pixel_color); } } diff --git a/src/lib/neon/fast_gaussian_next_f16.rs b/src/lib/neon/fast_gaussian_next_f16.rs index 49adb8c..806430d 100644 --- a/src/lib/neon/fast_gaussian_next_f16.rs +++ b/src/lib/neon/fast_gaussian_next_f16.rs @@ -47,83 +47,82 @@ pub fn fast_gaussian_next_vertical_pass_neon_f16< start: u32, end: u32, ) { - let edge_mode: EdgeMode = EDGE_MODE.into(); - let bytes: &UnsafeSlice<'_, f16> = unsafe { std::mem::transmute(undef_bytes) }; - let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024]; + unsafe { + let edge_mode: EdgeMode = EDGE_MODE.into(); + let bytes: &UnsafeSlice<'_, f16> = std::mem::transmute(undef_bytes); + let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024]; - let height_wide = height as i64; + let height_wide = height as i64; - let radius_64 = radius as i64; - let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32)); - let f_weight = unsafe { vdupq_n_f32(weight) }; - for x in start..std::cmp::min(width, end) { - let mut diffs: float32x4_t = unsafe { vdupq_n_f32(0f32) }; - let mut ders: float32x4_t = unsafe { vdupq_n_f32(0f32) }; - let mut summs: float32x4_t = unsafe { vdupq_n_f32(0f32) }; + let radius_64 = radius as i64; + let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32)); + let f_weight = vdupq_n_f32(weight); + for x in start..std::cmp::min(width, end) { + let mut diffs: float32x4_t = vdupq_n_f32(0f32); + let mut 
ders: float32x4_t = vdupq_n_f32(0f32); + let mut summs: float32x4_t = vdupq_n_f32(0f32); - let start_y = 0 - 3 * radius as i64; - for y in start_y..height_wide { - let current_y = (y * (stride as i64)) as usize; + let start_y = 0 - 3 * radius as i64; + for y in start_y..height_wide { + let current_y = (y * (stride as i64)) as usize; - if y >= 0 { - let current_px = (std::cmp::max(x, 0)) as usize * CHANNELS_COUNT; + if y >= 0 { + let current_px = (std::cmp::max(x, 0)) as usize * CHANNELS_COUNT; - let prepared_px = unsafe { vmulq_f32(summs, f_weight) }; - unsafe { + let prepared_px = vmulq_f32(summs, f_weight); let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f16; store_f32_f16::(dst_ptr, prepared_px); - } - let d_arr_index_1 = ((y + radius_64) & 1023) as usize; - let d_arr_index_2 = ((y - radius_64) & 1023) as usize; - let d_arr_index = (y & 1023) as usize; + let d_arr_index_1 = ((y + radius_64) & 1023) as usize; + let d_arr_index_2 = ((y - radius_64) & 1023) as usize; + let d_arr_index = (y & 1023) as usize; - let buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *mut f32 }; - let stored = unsafe { vld1q_f32(buf_ptr) }; + let buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *mut f32; + let stored = vld1q_f32(buf_ptr); - let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32 }; - let stored_1 = unsafe { vld1q_f32(buf_ptr_1) }; + let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32; + let stored_1 = vld1q_f32(buf_ptr_1); - let buf_ptr_2 = unsafe { buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32 }; - let stored_2 = unsafe { vld1q_f32(buf_ptr_2) }; + let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32; + let stored_2 = vld1q_f32(buf_ptr_2); - let new_diff = - unsafe { vsubq_f32(vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32), stored_2) }; - diffs = unsafe { vaddq_f32(diffs, new_diff) }; - } else if y + radius_64 >= 0 { - let arr_index = (y & 1023) as usize; - let arr_index_1 = ((y + radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 }; - let stored = unsafe { vld1q_f32(buf_ptr) }; + let new_diff = + vsubq_f32(vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32), stored_2); + diffs = vaddq_f32(diffs, new_diff); + } else if y + radius_64 >= 0 { + let arr_index = (y & 1023) as usize; + let arr_index_1 = ((y + radius_64) & 1023) as usize; + let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32; + let stored = vld1q_f32(buf_ptr); - let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(arr_index_1) as *mut f32 }; - let stored_1 = unsafe { vld1q_f32(buf_ptr_1) }; + let buf_ptr_1 = buffer.as_mut_ptr().add(arr_index_1) as *mut f32; + let stored_1 = vld1q_f32(buf_ptr_1); - let new_diff = unsafe { vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32) }; + let new_diff = vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32); - diffs = unsafe { vaddq_f32(diffs, new_diff) }; - } else if y + 2 * radius_64 >= 0 { - let arr_index = ((y + radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 }; - let stored = unsafe { vld1q_f32(buf_ptr) }; - diffs = unsafe { vsubq_f32(diffs, vmulq_n_f32(stored, 3f32)) }; - } + diffs = vaddq_f32(diffs, new_diff); + } else if y + 2 * radius_64 >= 0 { + let arr_index = ((y + radius_64) & 1023) as usize; + let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32; + let stored = vld1q_f32(buf_ptr); + diffs = vsubq_f32(diffs, vmulq_n_f32(stored, 3f32)); + } - let next_row_y = clamp_edge!(edge_mode, y + ((3 * radius_64) 
>> 1), 0, height_wide - 1) - * (stride as usize); - let next_row_x = x as usize * CHANNELS_COUNT; + let next_row_y = + clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1) + * (stride as usize); + let next_row_x = x as usize * CHANNELS_COUNT; - let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f16 }; + let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f16; - let pixel_color = unsafe { load_f32_f16::(s_ptr) }; + let pixel_color = load_f32_f16::(s_ptr); - let arr_index = ((y + 2 * radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 }; + let arr_index = ((y + 2 * radius_64) & 1023) as usize; + let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32; - diffs = unsafe { vaddq_f32(diffs, pixel_color) }; - ders = unsafe { vaddq_f32(ders, diffs) }; - summs = unsafe { vaddq_f32(summs, ders) }; - unsafe { + diffs = vaddq_f32(diffs, pixel_color); + ders = vaddq_f32(ders, diffs); + summs = vaddq_f32(summs, ders); vst1q_f32(buf_ptr, pixel_color); } } @@ -143,83 +142,81 @@ pub fn fast_gaussian_next_horizontal_pass_neon_f16< start: u32, end: u32, ) { - let edge_mode: EdgeMode = EDGE_MODE.into(); - let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024]; - let bytes: &UnsafeSlice<'_, f16> = unsafe { std::mem::transmute(undef_bytes) }; + unsafe { + let edge_mode: EdgeMode = EDGE_MODE.into(); + let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024]; + let bytes: &UnsafeSlice<'_, f16> = std::mem::transmute(undef_bytes); - let width_wide = width as i64; + let width_wide = width as i64; - let radius_64 = radius as i64; - let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32)); - let f_weight = unsafe { vdupq_n_f32(weight) }; + let radius_64 = radius as i64; + let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32)); + let f_weight = vdupq_n_f32(weight); - for y in start..std::cmp::min(height, end) { - let mut diffs: float32x4_t = unsafe { vdupq_n_f32(0f32) }; - let mut ders: float32x4_t = unsafe { vdupq_n_f32(0f32) }; - let mut summs: float32x4_t = unsafe { vdupq_n_f32(0f32) }; + for y in start..std::cmp::min(height, end) { + let mut diffs: float32x4_t = vdupq_n_f32(0f32); + let mut ders: float32x4_t = vdupq_n_f32(0f32); + let mut summs: float32x4_t = vdupq_n_f32(0f32); - let current_y = ((y as i64) * (stride as i64)) as usize; + let current_y = ((y as i64) * (stride as i64)) as usize; - for x in (0 - 3 * radius_64)..(width as i64) { - if x >= 0 { - let current_px = x as usize * CHANNELS_COUNT; + for x in (0 - 3 * radius_64)..(width as i64) { + if x >= 0 { + let current_px = x as usize * CHANNELS_COUNT; - let prepared_px = unsafe { vmulq_f32(summs, f_weight) }; + let prepared_px = vmulq_f32(summs, f_weight); - unsafe { let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f16; store_f32_f16::(dst_ptr, prepared_px); - } - let d_arr_index_1 = ((x + radius_64) & 1023) as usize; - let d_arr_index_2 = ((x - radius_64) & 1023) as usize; - let d_arr_index = (x & 1023) as usize; + let d_arr_index_1 = ((x + radius_64) & 1023) as usize; + let d_arr_index_2 = ((x - radius_64) & 1023) as usize; + let d_arr_index = (x & 1023) as usize; - let buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *mut f32 }; - let stored = unsafe { vld1q_f32(buf_ptr) }; + let buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *mut f32; + let stored = vld1q_f32(buf_ptr); - let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32 }; - let stored_1 = 
unsafe { vld1q_f32(buf_ptr_1) }; + let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32; + let stored_1 = vld1q_f32(buf_ptr_1); - let buf_ptr_2 = unsafe { buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32 }; - let stored_2 = unsafe { vld1q_f32(buf_ptr_2) }; + let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32; + let stored_2 = vld1q_f32(buf_ptr_2); - let new_diff = - unsafe { vsubq_f32(vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32), stored_2) }; - diffs = unsafe { vaddq_f32(diffs, new_diff) }; - } else if x + radius_64 >= 0 { - let arr_index = (x & 1023) as usize; - let arr_index_1 = ((x + radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 }; - let stored = unsafe { vld1q_f32(buf_ptr) }; + let new_diff = + vsubq_f32(vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32), stored_2); + diffs = vaddq_f32(diffs, new_diff); + } else if x + radius_64 >= 0 { + let arr_index = (x & 1023) as usize; + let arr_index_1 = ((x + radius_64) & 1023) as usize; + let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32; + let stored = vld1q_f32(buf_ptr); - let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(arr_index_1) as *mut f32 }; - let stored_1 = unsafe { vld1q_f32(buf_ptr_1) }; + let buf_ptr_1 = buffer.as_mut_ptr().add(arr_index_1) as *mut f32; + let stored_1 = vld1q_f32(buf_ptr_1); - let new_diff = unsafe { vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32) }; + let new_diff = vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32); - diffs = unsafe { vaddq_f32(diffs, new_diff) }; - } else if x + 2 * radius_64 >= 0 { - let arr_index = ((x + radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 }; - let stored = unsafe { vld1q_f32(buf_ptr) }; - diffs = unsafe { vsubq_f32(diffs, vmulq_n_f32(stored, 3f32)) }; - } + diffs = vaddq_f32(diffs, new_diff); + } else if x + 2 * radius_64 >= 0 { + let arr_index = ((x + radius_64) & 1023) as usize; + let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32; + let stored = vld1q_f32(buf_ptr); + diffs = vsubq_f32(diffs, vmulq_n_f32(stored, 3f32)); + } - let next_row_y = (y as usize) * (stride as usize); - let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1); - let next_row_px = next_row_x * CHANNELS_COUNT; + let next_row_y = (y as usize) * (stride as usize); + let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1); + let next_row_px = next_row_x * CHANNELS_COUNT; - let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f16 }; - let pixel_color = unsafe { load_f32_f16::(s_ptr) }; + let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f16; + let pixel_color = load_f32_f16::(s_ptr); - let arr_index = ((x + 2 * radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 }; + let arr_index = ((x + 2 * radius_64) & 1023) as usize; + let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32; - diffs = unsafe { vaddq_f32(diffs, pixel_color) }; - ders = unsafe { vaddq_f32(ders, diffs) }; - summs = unsafe { vaddq_f32(summs, ders) }; - unsafe { + diffs = vaddq_f32(diffs, pixel_color); + ders = vaddq_f32(ders, diffs); + summs = vaddq_f32(summs, ders); vst1q_f32(buf_ptr, pixel_color); } } diff --git a/src/lib/neon/fast_gaussian_next_f32.rs b/src/lib/neon/fast_gaussian_next_f32.rs index 62e68c5..52ba306 100644 --- a/src/lib/neon/fast_gaussian_next_f32.rs +++ b/src/lib/neon/fast_gaussian_next_f32.rs @@ -45,83 +45,82 @@ pub fn 
diff --git a/src/lib/neon/fast_gaussian_next_f32.rs b/src/lib/neon/fast_gaussian_next_f32.rs
index 62e68c5..52ba306 100644
--- a/src/lib/neon/fast_gaussian_next_f32.rs
+++ b/src/lib/neon/fast_gaussian_next_f32.rs
@@ -45,83 +45,82 @@ pub fn fast_gaussian_next_vertical_pass_neon_f32<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undef_bytes) };
-    let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let bytes: &UnsafeSlice<'_, f32> = std::mem::transmute(undef_bytes);
+        let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
 
-    let height_wide = height as i64;
+        let height_wide = height as i64;
 
-    let radius_64 = radius as i64;
-    let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let f_weight = unsafe { vdupq_n_f32(weight) };
-    for x in start..std::cmp::min(width, end) {
-        let mut diffs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-        let mut ders: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-        let mut summs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
+        let radius_64 = radius as i64;
+        let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
+        let f_weight = vdupq_n_f32(weight);
+        for x in start..std::cmp::min(width, end) {
+            let mut diffs: float32x4_t = vdupq_n_f32(0f32);
+            let mut ders: float32x4_t = vdupq_n_f32(0f32);
+            let mut summs: float32x4_t = vdupq_n_f32(0f32);
 
-        let start_y = 0 - 3 * radius as i64;
-        for y in start_y..height_wide {
-            let current_y = (y * (stride as i64)) as usize;
+            let start_y = 0 - 3 * radius as i64;
+            for y in start_y..height_wide {
+                let current_y = (y * (stride as i64)) as usize;
 
-            if y >= 0 {
-                let current_px = (std::cmp::max(x, 0)) as usize * CHANNELS_COUNT;
+                if y >= 0 {
+                    let current_px = (std::cmp::max(x, 0)) as usize * CHANNELS_COUNT;
 
-                let prepared_px = unsafe { vmulq_f32(summs, f_weight) };
-                unsafe {
+                    let prepared_px = vmulq_f32(summs, f_weight);
                     let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f32;
                     store_f32::<CHANNELS_COUNT>(dst_ptr, prepared_px);
-                }
 
-                let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
-                let d_arr_index_2 = ((y - radius_64) & 1023) as usize;
-                let d_arr_index = (y & 1023) as usize;
+                    let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
+                    let d_arr_index_2 = ((y - radius_64) & 1023) as usize;
+                    let d_arr_index = (y & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *mut f32 };
-                let stored = unsafe { vld1q_f32(buf_ptr) };
+                    let buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *mut f32;
+                    let stored = vld1q_f32(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32 };
-                let stored_1 = unsafe { vld1q_f32(buf_ptr_1) };
+                    let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32;
+                    let stored_1 = vld1q_f32(buf_ptr_1);
 
-                let buf_ptr_2 = unsafe { buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32 };
-                let stored_2 = unsafe { vld1q_f32(buf_ptr_2) };
+                    let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32;
+                    let stored_2 = vld1q_f32(buf_ptr_2);
 
-                let new_diff =
-                    unsafe { vsubq_f32(vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32), stored_2) };
-                diffs = unsafe { vaddq_f32(diffs, new_diff) };
-            } else if y + radius_64 >= 0 {
-                let arr_index = (y & 1023) as usize;
-                let arr_index_1 = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let stored = unsafe { vld1q_f32(buf_ptr) };
+                    let new_diff =
+                        vsubq_f32(vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32), stored_2);
+                    diffs = vaddq_f32(diffs, new_diff);
+                } else if y + radius_64 >= 0 {
+                    let arr_index = (y & 1023) as usize;
+                    let arr_index_1 = ((y + radius_64) & 1023) as usize;
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let stored = vld1q_f32(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(arr_index_1) as *mut f32 };
-                let stored_1 = unsafe { vld1q_f32(buf_ptr_1) };
+                    let buf_ptr_1 = buffer.as_mut_ptr().add(arr_index_1) as *mut f32;
+                    let stored_1 = vld1q_f32(buf_ptr_1);
 
-                let new_diff = unsafe { vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32) };
+                    let new_diff = vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32);
 
-                diffs = unsafe { vaddq_f32(diffs, new_diff) };
-            } else if y + 2 * radius_64 >= 0 {
-                let arr_index = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let stored = unsafe { vld1q_f32(buf_ptr) };
-                diffs = unsafe { vsubq_f32(diffs, vmulq_n_f32(stored, 3f32)) };
-            }
+                    diffs = vaddq_f32(diffs, new_diff);
+                } else if y + 2 * radius_64 >= 0 {
+                    let arr_index = ((y + radius_64) & 1023) as usize;
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let stored = vld1q_f32(buf_ptr);
+                    diffs = vsubq_f32(diffs, vmulq_n_f32(stored, 3f32));
+                }
 
-            let next_row_y = clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
-                * (stride as usize);
-            let next_row_x = x as usize * CHANNELS_COUNT;
+                let next_row_y =
+                    clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
+                        * (stride as usize);
+                let next_row_x = x as usize * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f32 };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f32;
 
-            let pixel_color = unsafe { load_f32_fast::<CHANNELS_COUNT>(s_ptr) };
+                let pixel_color = load_f32_fast::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((y + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
+                let arr_index = ((y + 2 * radius_64) & 1023) as usize;
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
 
-            diffs = unsafe { vaddq_f32(diffs, pixel_color) };
-            ders = unsafe { vaddq_f32(ders, diffs) };
-            summs = unsafe { vaddq_f32(summs, ders) };
-            unsafe {
+                diffs = vaddq_f32(diffs, pixel_color);
+                ders = vaddq_f32(ders, diffs);
+                summs = vaddq_f32(summs, ders);
                 vst1q_f32(buf_ptr, pixel_color);
             }
         }
@@ -141,83 +140,81 @@ pub fn fast_gaussian_next_horizontal_pass_neon_f32<
     start: u32,
     end: u32,
 ) {
-    let edge_mode: EdgeMode = EDGE_MODE.into();
-    let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
-    let bytes: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undef_bytes) };
+    unsafe {
+        let edge_mode: EdgeMode = EDGE_MODE.into();
+        let mut buffer: [[f32; 4]; 1024] = [[0f32; 4]; 1024];
+        let bytes: &UnsafeSlice<'_, f32> = std::mem::transmute(undef_bytes);
 
-    let width_wide = width as i64;
+        let width_wide = width as i64;
 
-    let radius_64 = radius as i64;
-    let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let f_weight = unsafe { vdupq_n_f32(weight) };
+        let radius_64 = radius as i64;
+        let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
+        let f_weight = vdupq_n_f32(weight);
 
-    for y in start..std::cmp::min(height, end) {
-        let mut diffs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-        let mut ders: float32x4_t = unsafe { vdupq_n_f32(0f32) };
-        let mut summs: float32x4_t = unsafe { vdupq_n_f32(0f32) };
+        for y in start..std::cmp::min(height, end) {
+            let mut diffs: float32x4_t = vdupq_n_f32(0f32);
+            let mut ders: float32x4_t = vdupq_n_f32(0f32);
+            let mut summs: float32x4_t = vdupq_n_f32(0f32);
 
-        let current_y = ((y as i64) * (stride as i64)) as usize;
+            let current_y = ((y as i64) * (stride as i64)) as usize;
 
-        for x in (0 - 3 * radius_64)..(width as i64) {
-            if x >= 0 {
-                let current_px = x as usize * CHANNELS_COUNT;
+            for x in (0 - 3 * radius_64)..(width as i64) {
+                if x >= 0 {
+                    let current_px = x as usize * CHANNELS_COUNT;
 
-                let prepared_px = unsafe { vmulq_f32(summs, f_weight) };
+                    let prepared_px = vmulq_f32(summs, f_weight);
 
-                unsafe {
                     let dst_ptr = bytes.slice.as_ptr().add(current_y + current_px) as *mut f32;
                     store_f32::<CHANNELS_COUNT>(dst_ptr, prepared_px);
-                }
 
-                let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
-                let d_arr_index_2 = ((x - radius_64) & 1023) as usize;
-                let d_arr_index = (x & 1023) as usize;
+                    let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
+                    let d_arr_index_2 = ((x - radius_64) & 1023) as usize;
+                    let d_arr_index = (x & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *mut f32 };
-                let stored = unsafe { vld1q_f32(buf_ptr) };
+                    let buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *mut f32;
+                    let stored = vld1q_f32(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32 };
-                let stored_1 = unsafe { vld1q_f32(buf_ptr_1) };
+                    let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32;
+                    let stored_1 = vld1q_f32(buf_ptr_1);
 
-                let buf_ptr_2 = unsafe { buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32 };
-                let stored_2 = unsafe { vld1q_f32(buf_ptr_2) };
+                    let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32;
+                    let stored_2 = vld1q_f32(buf_ptr_2);
 
-                let new_diff =
-                    unsafe { vsubq_f32(vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32), stored_2) };
-                diffs = unsafe { vaddq_f32(diffs, new_diff) };
-            } else if x + radius_64 >= 0 {
-                let arr_index = (x & 1023) as usize;
-                let arr_index_1 = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let stored = unsafe { vld1q_f32(buf_ptr) };
+                    let new_diff =
+                        vsubq_f32(vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32), stored_2);
+                    diffs = vaddq_f32(diffs, new_diff);
+                } else if x + radius_64 >= 0 {
+                    let arr_index = (x & 1023) as usize;
+                    let arr_index_1 = ((x + radius_64) & 1023) as usize;
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let stored = vld1q_f32(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(arr_index_1) as *mut f32 };
-                let stored_1 = unsafe { vld1q_f32(buf_ptr_1) };
+                    let buf_ptr_1 = buffer.as_mut_ptr().add(arr_index_1) as *mut f32;
+                    let stored_1 = vld1q_f32(buf_ptr_1);
 
-                let new_diff = unsafe { vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32) };
+                    let new_diff = vmulq_n_f32(vsubq_f32(stored, stored_1), 3f32);
 
-                diffs = unsafe { vaddq_f32(diffs, new_diff) };
-            } else if x + 2 * radius_64 >= 0 {
-                let arr_index = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let stored = unsafe { vld1q_f32(buf_ptr) };
-                diffs = unsafe { vsubq_f32(diffs, vmulq_n_f32(stored, 3f32)) };
-            }
+                    diffs = vaddq_f32(diffs, new_diff);
+                } else if x + 2 * radius_64 >= 0 {
+                    let arr_index = ((x + radius_64) & 1023) as usize;
+                    let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                    let stored = vld1q_f32(buf_ptr);
+                    diffs = vsubq_f32(diffs, vmulq_n_f32(stored, 3f32));
+                }
 
-            let next_row_y = (y as usize) * (stride as usize);
-            let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1);
-            let next_row_px = next_row_x * CHANNELS_COUNT;
+                let next_row_y = (y as usize) * (stride as usize);
+                let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1);
+                let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32 };
-            let pixel_color = unsafe { load_f32_fast::<CHANNELS_COUNT>(s_ptr) };
+                let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32;
+                let pixel_color = load_f32_fast::<CHANNELS_COUNT>(s_ptr);
 
-            let arr_index = ((x + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
+                let arr_index = ((x + 2 * radius_64) & 1023) as usize;
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
 
-            diffs = unsafe { vaddq_f32(diffs, pixel_color) };
-            ders = unsafe { vaddq_f32(ders, diffs) };
-            summs = unsafe { vaddq_f32(summs, ders) };
-            unsafe {
+                diffs = vaddq_f32(diffs, pixel_color);
+                ders = vaddq_f32(ders, diffs);
+                summs = vaddq_f32(summs, ders);
                 vst1q_f32(buf_ptr, pixel_color);
             }
         }
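These passes are the vectorized form of the "fast gaussian next" recurrence: a 1024-slot ring buffer of recent pixels feeds three chained accumulators, so each output pixel costs O(1) regardless of radius. A scalar, single-channel sketch of the same recurrence (assuming a non-empty row and radius ≥ 1; a structural illustration, not the crate's API):

```rust
// Scalar model of the third-order running sum the NEON pass evaluates
// four lanes at a time. `buffer` plays the role of the ring buffer above.
fn fast_gaussian_next_1d(data: &mut [f32], radius: usize) {
    let r = radius as i64;
    let len = data.len() as i64;
    let weight = 1.0 / (radius as f32).powi(3);
    let mut buffer = [0f32; 1024];
    let (mut diffs, mut ders, mut summs) = (0f32, 0f32, 0f32);
    for x in -3 * r..len {
        if x >= 0 {
            // Write the normalized triple sum back in place.
            data[x as usize] = summs * weight;
            let stored = buffer[(x & 1023) as usize];
            let stored_1 = buffer[((x + r) & 1023) as usize];
            let stored_2 = buffer[((x - r) & 1023) as usize];
            diffs += 3.0 * (stored - stored_1) - stored_2;
        } else if x + r >= 0 {
            let stored = buffer[(x & 1023) as usize];
            let stored_1 = buffer[((x + r) & 1023) as usize];
            diffs += 3.0 * (stored - stored_1);
        } else if x + 2 * r >= 0 {
            diffs -= 3.0 * buffer[((x + r) & 1023) as usize];
        }
        // Read ahead of the write cursor, clamped at the edge,
        // exactly as the in-place passes do.
        let next = (x + 3 * r / 2).clamp(0, len - 1) as usize;
        let pixel = data[next];
        buffer[((x + 2 * r) & 1023) as usize] = pixel;
        diffs += pixel;
        ders += diffs;
        summs += ders;
    }
}

fn main() {
    let mut row: Vec<f32> = (0..32).map(|i| if i >= 16 { 255.0 } else { 0.0 }).collect();
    fast_gaussian_next_1d(&mut row, 4);
    println!("{:?}", row);
}
```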
diff --git a/src/lib/neon/utils.rs b/src/lib/neon/utils.rs
index 8b6423e..4d95a35 100644
--- a/src/lib/neon/utils.rs
+++ b/src/lib/neon/utils.rs
@@ -314,13 +314,7 @@ pub(crate) unsafe fn vsplit_rgb_5(px: float32x4x4_t) -> Float32x5T {
     let second_pixel = vextq_f32::<3>(px.0, px.1);
     let third_pixel = vextq_f32::<2>(px.1, px.2);
     let four_pixel = vextq_f32::<1>(px.2, px.3);
-    Float32x5T(
-        first_pixel,
-        second_pixel,
-        third_pixel,
-        four_pixel,
-        px.3,
-    )
+    Float32x5T(first_pixel, second_pixel, third_pixel, four_pixel, px.3)
 }
 
 pub(crate) struct Float32x5T(
@@ -419,3 +413,8 @@ pub(crate) unsafe fn store_u8x8_m4(
         dst_ptr.write_unaligned(bits[0]);
     }
 }
+
+#[inline]
+pub(crate) unsafe fn vmulq_by_3_s32(k: int32x4_t) -> int32x4_t {
+    vaddq_s32(vshlq_n_s32::<1>(k), k)
+}
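The new `vmulq_by_3_s32` helper strength-reduces a multiply by the constant 3 into a shift plus an add: 3k = (k << 1) + k. The identity in scalar form, with wrapping arithmetic to mirror the two's-complement behaviour of the SIMD lanes (standalone sketch, not crate code):

```rust
// Scalar equivalent of vmulq_by_3_s32 (and of the SSE _mm_mul_by_3_epi32
// introduced later in this patch).
fn mul_by_3(k: i32) -> i32 {
    k.wrapping_shl(1).wrapping_add(k)
}

fn main() {
    for k in [-5, 0, 7, 123_456, i32::MAX] {
        assert_eq!(mul_by_3(k), k.wrapping_mul(3));
    }
}
```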
diff --git a/src/lib/sse/fast_gaussian.rs b/src/lib/sse/fast_gaussian.rs
index 750355e..0d41417 100644
--- a/src/lib/sse/fast_gaussian.rs
+++ b/src/lib/sse/fast_gaussian.rs
@@ -87,10 +87,10 @@ unsafe fn fast_gaussian_horizontal_pass_sse_u8_impl<
     let width_wide = width as i64;
 
     const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
-    let v_weight = unsafe { _mm_set1_ps(1f32 / (radius as f32 * radius as f32)) };
+    let v_weight = _mm_set1_ps(1f32 / (radius as f32 * radius as f32));
     for y in start..std::cmp::min(height, end) {
-        let mut diffs = unsafe { _mm_setzero_si128() };
-        let mut summs = unsafe { _mm_set1_epi32(initial_sum) };
+        let mut diffs = _mm_setzero_si128();
+        let mut summs = _mm_set1_epi32(initial_sum);
 
         let current_y = ((y as i64) * (stride as i64)) as usize;
 
@@ -99,52 +99,48 @@ unsafe fn fast_gaussian_horizontal_pass_sse_u8_impl<
             if x >= 0 {
                 let current_px = ((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;
 
-                let pixel_f32 = unsafe {
-                    _mm_round_ps::<ROUNDING_FLAGS>(_mm_mul_ps(_mm_cvtepi32_ps(summs), v_weight))
-                };
-                let pixel_u32 = unsafe { _mm_cvtps_epi32(pixel_f32) };
+                let pixel_f32 =
+                    _mm_round_ps::<ROUNDING_FLAGS>(_mm_mul_ps(_mm_cvtepi32_ps(summs), v_weight));
+                let pixel_u32 = _mm_cvtps_epi32(pixel_f32);
 
                 let bytes_offset = current_y + current_px;
 
-                unsafe {
-                    let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
-                    store_u8_u32::<CHANNELS_COUNT>(dst_ptr, pixel_u32);
-                }
+                let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
+                store_u8_u32::<CHANNELS_COUNT>(dst_ptr, pixel_u32);
 
                 let arr_index = ((x - radius_64) & 1023) as usize;
                 let d_arr_index = (x & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.as_mut_ptr().add(d_arr_index) as *const i32 };
-                let mut d_stored = unsafe { _mm_loadu_si128(d_buf_ptr as *const __m128i) };
-                d_stored = unsafe { _mm_slli_epi32::<1>(d_stored) };
+                let d_buf_ptr = buffer.as_mut_ptr().add(d_arr_index) as *const i32;
+                let mut d_stored = _mm_loadu_si128(d_buf_ptr as *const __m128i);
+                d_stored = _mm_slli_epi32::<1>(d_stored);
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let a_stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let a_stored = _mm_loadu_si128(buf_ptr as *const __m128i);
 
-                diffs = unsafe { _mm_add_epi32(diffs, _mm_sub_epi32(a_stored, d_stored)) };
+                diffs = _mm_add_epi32(diffs, _mm_sub_epi32(a_stored, d_stored));
             } else if x + radius_64 >= 0 {
                 let arr_index = (x & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let mut stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
-                stored = unsafe { _mm_slli_epi32::<1>(stored) };
-                diffs = unsafe { _mm_sub_epi32(diffs, stored) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let mut stored = _mm_loadu_si128(buf_ptr as *const __m128i);
+                stored = _mm_slli_epi32::<1>(stored);
+                diffs = _mm_sub_epi32(diffs, stored);
             }
 
             let next_row_y = (y as usize) * (stride as usize);
             let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
             let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8 };
-            let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8;
+            let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((x + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_epi32(diffs, pixel_color) };
-            summs = unsafe { _mm_add_epi32(summs, diffs) };
-            unsafe {
-                _mm_storeu_si128(buf_ptr as *mut __m128i, pixel_color);
-            }
+            diffs = _mm_add_epi32(diffs, pixel_color);
+            summs = _mm_add_epi32(summs, diffs);
+
+            _mm_storeu_si128(buf_ptr as *mut __m128i, pixel_color);
         }
     }
 }
@@ -191,7 +187,7 @@ unsafe fn fast_gaussian_vertical_pass_sse_u8_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
     let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
     let initial_sum = ((radius * radius) >> 1) as i32;
 
@@ -201,11 +197,11 @@
 
     const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
 
-    let v_weight = unsafe { _mm_set1_ps(1f32 / (radius as f32 * radius as f32)) };
+    let v_weight = _mm_set1_ps(1f32 / (radius as f32 * radius as f32));
 
    for x in start..std::cmp::min(width, end) {
-        let mut diffs = unsafe { _mm_setzero_si128() };
-        let mut summs = unsafe { _mm_set1_epi32(initial_sum) };
+        let mut diffs = _mm_setzero_si128();
+        let mut summs = _mm_set1_epi32(initial_sum);
 
         let start_y = 0 - 2 * radius as i64;
         for y in start_y..height_wide {
@@ -214,52 +210,47 @@
             if y >= 0 {
                 let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
 
-                let pixel_f32 = unsafe {
-                    _mm_round_ps::<ROUNDING_FLAGS>(_mm_mul_ps(_mm_cvtepi32_ps(summs), v_weight))
-                };
-                let pixel_u32 = unsafe { _mm_cvtps_epi32(pixel_f32) };
+                let pixel_f32 =
+                    _mm_round_ps::<ROUNDING_FLAGS>(_mm_mul_ps(_mm_cvtepi32_ps(summs), v_weight));
+                let pixel_u32 = _mm_cvtps_epi32(pixel_f32);
 
                 let bytes_offset = current_y + current_px;
 
-                unsafe {
-                    let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
-                    store_u8_u32::<CHANNELS_COUNT>(dst_ptr, pixel_u32);
-                }
+                let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
+                store_u8_u32::<CHANNELS_COUNT>(dst_ptr, pixel_u32);
 
                 let arr_index = ((y - radius_64) & 1023) as usize;
                 let d_arr_index = (y & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let mut d_stored = unsafe { _mm_loadu_si128(d_buf_ptr as *const __m128i) };
-                d_stored = unsafe { _mm_slli_epi32::<1>(d_stored) };
+                let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let mut d_stored = _mm_loadu_si128(d_buf_ptr as *const __m128i);
+                d_stored = _mm_slli_epi32::<1>(d_stored);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut i32 };
-                let a_stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut i32;
+                let a_stored = _mm_loadu_si128(buf_ptr as *const __m128i);
 
-                diffs = unsafe { _mm_add_epi32(diffs, _mm_sub_epi32(a_stored, d_stored)) };
+                diffs = _mm_add_epi32(diffs, _mm_sub_epi32(a_stored, d_stored));
             } else if y + radius_64 >= 0 {
                 let arr_index = (y & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let mut stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
-                stored = unsafe { _mm_slli_epi32::<1>(stored) };
-                diffs = unsafe { _mm_sub_epi32(diffs, stored) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let mut stored = _mm_loadu_si128(buf_ptr as *const __m128i);
+                stored = _mm_slli_epi32::<1>(stored);
+                diffs = _mm_sub_epi32(diffs, stored);
             }
 
             let next_row_y =
                 clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
             let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8 };
-            let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8;
+            let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((y + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_epi32(diffs, pixel_color) };
-            summs = unsafe { _mm_add_epi32(summs, diffs) };
-            unsafe {
-                _mm_storeu_si128(buf_ptr as *mut __m128i, pixel_color);
-            }
+            diffs = _mm_add_epi32(diffs, pixel_color);
+            summs = _mm_add_epi32(summs, diffs);
+            _mm_storeu_si128(buf_ptr as *mut __m128i, pixel_color);
        }
    }
 }
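The u8 path converts the integer running sum to pixels by scaling with 1/r² and rounding to nearest; `_MM_FROUND_TO_NEAREST_INT` is round-half-to-even. A scalar model of that store step — `round_ties_even` needs Rust 1.77+, and the function name is illustrative:

```rust
// What _mm_round_ps::<ROUNDING_FLAGS> + _mm_cvtps_epi32 + the u8 store
// compute per lane, modeled in scalar form.
fn normalize_to_u8(summ: i32, radius: u32) -> u8 {
    let weight = 1f32 / (radius as f32 * radius as f32);
    (summ as f32 * weight).round_ties_even().clamp(0.0, 255.0) as u8
}

fn main() {
    assert_eq!(normalize_to_u8(1020, 2), 255); // 1020 / 4 = 255
    assert_eq!(normalize_to_u8(10, 2), 2); // 2.5 ties to even, not 3
}
```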
diff --git a/src/lib/sse/fast_gaussian_f16.rs b/src/lib/sse/fast_gaussian_f16.rs
index 4d89b3f..e2811e0 100644
--- a/src/lib/sse/fast_gaussian_f16.rs
+++ b/src/lib/sse/fast_gaussian_f16.rs
@@ -80,18 +80,18 @@ unsafe fn fast_gaussian_horizontal_pass_sse_f16_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f16> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, f16> = std::mem::transmute(undefined_slice);
     let mut buffer: [[f32; 4]; 1024] = [[0.; 4]; 1024];
 
     let radius_64 = radius as i64;
     let width_wide = width as i64;
 
-    let v_half = unsafe { _mm_set1_ps(2.) };
-    let v_weight = unsafe { _mm_set1_ps(1f32 / (radius as f32 * radius as f32)) };
+    let v_half = _mm_set1_ps(2.);
+    let v_weight = _mm_set1_ps(1f32 / (radius as f32 * radius as f32));
 
     for y in start..std::cmp::min(height, end) {
-        let mut diffs = unsafe { _mm_setzero_ps() };
-        let mut summs = unsafe { _mm_setzero_ps() };
+        let mut diffs = _mm_setzero_ps();
+        let mut summs = _mm_setzero_ps();
 
         let current_y = ((y as i64) * (stride as i64)) as usize;
 
@@ -100,49 +100,45 @@ unsafe fn fast_gaussian_horizontal_pass_sse_f16_impl<
             if x >= 0 {
                 let current_px = ((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;
 
-                let pixel = unsafe { _mm_mul_ps(summs, v_weight) };
+                let pixel = _mm_mul_ps(summs, v_weight);
 
                 let bytes_offset = current_y + current_px;
 
-                unsafe {
-                    let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f16;
-                    store_f32_f16::<CHANNELS_COUNT>(dst_ptr, pixel);
-                }
+                let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f16;
+                store_f32_f16::<CHANNELS_COUNT>(dst_ptr, pixel);
 
                 let arr_index = ((x - radius_64) & 1023) as usize;
                 let d_arr_index = (x & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let mut d_stored = unsafe { _mm_loadu_ps(d_buf_ptr) };
-                d_stored = unsafe { _mm_mul_ps(d_stored, v_half) };
+                let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let mut d_stored = _mm_loadu_ps(d_buf_ptr);
+                d_stored = _mm_mul_ps(d_stored, v_half);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let a_stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                let a_stored = _mm_loadu_ps(buf_ptr);
 
-                diffs = unsafe { _mm_add_ps(diffs, _mm_sub_ps(a_stored, d_stored)) };
+                diffs = _mm_add_ps(diffs, _mm_sub_ps(a_stored, d_stored));
             } else if x + radius_64 >= 0 {
                 let arr_index = (x & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let mut stored = unsafe { _mm_loadu_ps(buf_ptr) };
-                stored = unsafe { _mm_mul_ps(stored, v_half) };
-                diffs = unsafe { _mm_sub_ps(diffs, stored) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let mut stored = _mm_loadu_ps(buf_ptr);
+                stored = _mm_mul_ps(stored, v_half);
+                diffs = _mm_sub_ps(diffs, stored);
             }
 
             let next_row_y = (y as usize) * (stride as usize);
             let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
             let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f16 };
-            let pixel_color = unsafe { load_f32_f16::<CHANNELS_COUNT>(s_ptr) };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f16;
+            let pixel_color = load_f32_f16::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((x + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_ps(diffs, pixel_color) };
-            summs = unsafe { _mm_add_ps(summs, diffs) };
-            unsafe {
-                _mm_storeu_ps(buf_ptr, pixel_color);
-            }
+            diffs = _mm_add_ps(diffs, pixel_color);
+            summs = _mm_add_ps(summs, diffs);
+            _mm_storeu_ps(buf_ptr, pixel_color);
         }
     }
 }
@@ -189,18 +185,18 @@ unsafe fn fast_gaussian_vertical_pass_sse_f16_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f16> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, f16> = std::mem::transmute(undefined_slice);
     let mut buffer: [[f32; 4]; 1024] = [[0.; 4]; 1024];
 
-    let v_half = unsafe { _mm_set1_ps(2.) };
-    let v_weight = unsafe { _mm_set1_ps(1f32 / (radius as f32 * radius as f32)) };
+    let v_half = _mm_set1_ps(2.);
+    let v_weight = _mm_set1_ps(1f32 / (radius as f32 * radius as f32));
 
     let height_wide = height as i64;
 
     let radius_64 = radius as i64;
     for x in start..std::cmp::min(width, end) {
-        let mut diffs = unsafe { _mm_setzero_ps() };
-        let mut summs = unsafe { _mm_setzero_ps() };
+        let mut diffs = _mm_setzero_ps();
+        let mut summs = _mm_setzero_ps();
 
         let start_y = 0 - 2 * radius as i64;
         for y in start_y..height_wide {
@@ -209,49 +205,45 @@
             if y >= 0 {
                 let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
 
-                let pixel = unsafe { _mm_mul_ps(summs, v_weight) };
+                let pixel = _mm_mul_ps(summs, v_weight);
 
                 let bytes_offset = current_y + current_px;
 
-                unsafe {
-                    let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f16;
-                    store_f32_f16::<CHANNELS_COUNT>(dst_ptr, pixel);
-                }
+                let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f16;
+                store_f32_f16::<CHANNELS_COUNT>(dst_ptr, pixel);
 
                 let arr_index = ((y - radius_64) & 1023) as usize;
                 let d_arr_index = (y & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let mut d_stored = unsafe { _mm_loadu_ps(d_buf_ptr) };
-                d_stored = unsafe { _mm_mul_ps(d_stored, v_half) };
+                let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let mut d_stored = _mm_loadu_ps(d_buf_ptr);
+                d_stored = _mm_mul_ps(d_stored, v_half);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let a_stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                let a_stored = _mm_loadu_ps(buf_ptr);
 
-                diffs = unsafe { _mm_add_ps(diffs, _mm_sub_ps(a_stored, d_stored)) };
+                diffs = _mm_add_ps(diffs, _mm_sub_ps(a_stored, d_stored));
             } else if y + radius_64 >= 0 {
                 let arr_index = (y & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let mut stored = unsafe { _mm_loadu_ps(buf_ptr) };
-                stored = unsafe { _mm_mul_ps(stored, v_half) };
-                diffs = unsafe { _mm_sub_ps(diffs, stored) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let mut stored = _mm_loadu_ps(buf_ptr);
+                stored = _mm_mul_ps(stored, v_half);
+                diffs = _mm_sub_ps(diffs, stored);
             }
 
             let next_row_y =
                 clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
             let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f16 };
-            let pixel_color = unsafe { load_f32_f16::<CHANNELS_COUNT>(s_ptr) };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f16;
+            let pixel_color = load_f32_f16::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((y + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_ps(diffs, pixel_color) };
-            summs = unsafe { _mm_add_ps(summs, diffs) };
-            unsafe {
-                _mm_storeu_ps(buf_ptr, pixel_color);
-            }
+            diffs = _mm_add_ps(diffs, pixel_color);
+            summs = _mm_add_ps(summs, diffs);
+            _mm_storeu_ps(buf_ptr, pixel_color);
         }
     }
 }
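The f16 variants keep all arithmetic in f32 and convert only when touching the image, which is what `load_f32_f16`/`store_f32_f16` do at the boundary. A sketch of that conversion boundary using the `half` crate — that the crate's `f16` alias ultimately comes from `half` is an assumption on this side:

```rust
// Cargo.toml (assumed): half = "2"
use half::f16;

// Load f16 -> compute in f32 -> store f16, the shape of the passes above.
fn scale_f16_pixel(px: f16, weight: f32) -> f16 {
    f16::from_f32(px.to_f32() * weight)
}

fn main() {
    let px = f16::from_f32(0.5);
    println!("{}", scale_f16_pixel(px, 0.25)); // 0.125
}
```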
diff --git a/src/lib/sse/fast_gaussian_f32.rs b/src/lib/sse/fast_gaussian_f32.rs
index 5665d53..9ac5dfa 100644
--- a/src/lib/sse/fast_gaussian_f32.rs
+++ b/src/lib/sse/fast_gaussian_f32.rs
@@ -78,18 +78,18 @@ unsafe fn fast_gaussian_horizontal_pass_sse_f32_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, f32> = std::mem::transmute(undefined_slice);
     let mut buffer: [[f32; 4]; 1024] = [[0.; 4]; 1024];
 
     let radius_64 = radius as i64;
     let width_wide = width as i64;
 
-    let v_half = unsafe { _mm_set1_ps(2.) };
-    let v_weight = unsafe { _mm_set1_ps(1f32 / (radius as f32 * radius as f32)) };
+    let v_half = _mm_set1_ps(2.);
+    let v_weight = _mm_set1_ps(1f32 / (radius as f32 * radius as f32));
 
     for y in start..std::cmp::min(height, end) {
-        let mut diffs = unsafe { _mm_setzero_ps() };
-        let mut summs = unsafe { _mm_setzero_ps() };
+        let mut diffs = _mm_setzero_ps();
+        let mut summs = _mm_setzero_ps();
 
         let current_y = ((y as i64) * (stride as i64)) as usize;
 
@@ -98,49 +98,45 @@ unsafe fn fast_gaussian_horizontal_pass_sse_f32_impl<
             if x >= 0 {
                 let current_px = ((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;
 
-                let pixel = unsafe { _mm_mul_ps(summs, v_weight) };
+                let pixel = _mm_mul_ps(summs, v_weight);
 
                 let bytes_offset = current_y + current_px;
 
-                unsafe {
-                    let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f32;
-                    store_f32::<CHANNELS_COUNT>(dst_ptr, pixel);
-                }
+                let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f32;
+                store_f32::<CHANNELS_COUNT>(dst_ptr, pixel);
 
                 let arr_index = ((x - radius_64) & 1023) as usize;
                 let d_arr_index = (x & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let mut d_stored = unsafe { _mm_loadu_ps(d_buf_ptr) };
-                d_stored = unsafe { _mm_mul_ps(d_stored, v_half) };
+                let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let mut d_stored = _mm_loadu_ps(d_buf_ptr);
+                d_stored = _mm_mul_ps(d_stored, v_half);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let a_stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                let a_stored = _mm_loadu_ps(buf_ptr);
 
-                diffs = unsafe { _mm_add_ps(diffs, _mm_sub_ps(a_stored, d_stored)) };
+                diffs = _mm_add_ps(diffs, _mm_sub_ps(a_stored, d_stored));
             } else if x + radius_64 >= 0 {
                 let arr_index = (x & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let mut stored = unsafe { _mm_loadu_ps(buf_ptr) };
-                stored = unsafe { _mm_mul_ps(stored, v_half) };
-                diffs = unsafe { _mm_sub_ps(diffs, stored) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let mut stored = _mm_loadu_ps(buf_ptr);
+                stored = _mm_mul_ps(stored, v_half);
+                diffs = _mm_sub_ps(diffs, stored);
             }
 
             let next_row_y = (y as usize) * (stride as usize);
             let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
             let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32 };
-            let pixel_color = unsafe { load_f32::<CHANNELS_COUNT>(s_ptr) };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32;
+            let pixel_color = load_f32::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((x + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_ps(diffs, pixel_color) };
-            summs = unsafe { _mm_add_ps(summs, diffs) };
-            unsafe {
-                _mm_storeu_ps(buf_ptr, pixel_color);
-            }
+            diffs = _mm_add_ps(diffs, pixel_color);
+            summs = _mm_add_ps(summs, diffs);
+            _mm_storeu_ps(buf_ptr, pixel_color);
         }
     }
 }
@@ -187,18 +183,18 @@ unsafe fn fast_gaussian_vertical_pass_sse_f32_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, f32> = std::mem::transmute(undefined_slice);
     let mut buffer: [[f32; 4]; 1024] = [[0.; 4]; 1024];
 
-    let v_half = unsafe { _mm_set1_ps(2.) };
-    let v_weight = unsafe { _mm_set1_ps(1f32 / (radius as f32 * radius as f32)) };
+    let v_half = _mm_set1_ps(2.);
+    let v_weight = _mm_set1_ps(1f32 / (radius as f32 * radius as f32));
 
     let height_wide = height as i64;
 
     let radius_64 = radius as i64;
     for x in start..std::cmp::min(width, end) {
-        let mut diffs = unsafe { _mm_setzero_ps() };
-        let mut summs = unsafe { _mm_setzero_ps() };
+        let mut diffs = _mm_setzero_ps();
+        let mut summs = _mm_setzero_ps();
 
         let start_y = 0 - 2 * radius as i64;
         for y in start_y..height_wide {
@@ -207,49 +203,45 @@
             if y >= 0 {
                 let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
 
-                let pixel = unsafe { _mm_mul_ps(summs, v_weight) };
+                let pixel = _mm_mul_ps(summs, v_weight);
 
                 let bytes_offset = current_y + current_px;
 
-                unsafe {
-                    let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f32;
-                    store_f32::<CHANNELS_COUNT>(dst_ptr, pixel);
-                }
+                let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f32;
+                store_f32::<CHANNELS_COUNT>(dst_ptr, pixel);
 
                 let arr_index = ((y - radius_64) & 1023) as usize;
                 let d_arr_index = (y & 1023) as usize;
 
-                let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let mut d_stored = unsafe { _mm_loadu_ps(d_buf_ptr) };
-                d_stored = unsafe { _mm_mul_ps(d_stored, v_half) };
+                let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let mut d_stored = _mm_loadu_ps(d_buf_ptr);
+                d_stored = _mm_mul_ps(d_stored, v_half);
 
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let a_stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                let a_stored = _mm_loadu_ps(buf_ptr);
 
-                diffs = unsafe { _mm_add_ps(diffs, _mm_sub_ps(a_stored, d_stored)) };
+                diffs = _mm_add_ps(diffs, _mm_sub_ps(a_stored, d_stored));
             } else if y + radius_64 >= 0 {
                 let arr_index = (y & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let mut stored = unsafe { _mm_loadu_ps(buf_ptr) };
-                stored = unsafe { _mm_mul_ps(stored, v_half) };
-                diffs = unsafe { _mm_sub_ps(diffs, stored) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let mut stored = _mm_loadu_ps(buf_ptr);
+                stored = _mm_mul_ps(stored, v_half);
+                diffs = _mm_sub_ps(diffs, stored);
             }
 
             let next_row_y =
                 clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
             let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f32 };
-            let pixel_color = unsafe { load_f32::<CHANNELS_COUNT>(s_ptr) };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f32;
+            let pixel_color = load_f32::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((y + radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_ps(diffs, pixel_color) };
-            summs = unsafe { _mm_add_ps(summs, diffs) };
-            unsafe {
-                _mm_storeu_ps(buf_ptr, pixel_color);
-            }
+            diffs = _mm_add_ps(diffs, pixel_color);
+            summs = _mm_add_ps(summs, diffs);
+            _mm_storeu_ps(buf_ptr, pixel_color);
         }
     }
 }
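The plain fast-gaussian f32 pass is the second-order counterpart of the recurrence sketched earlier: one difference accumulator and one running sum, with the ring-buffer slot at the cursor doubled on exit (the `v_half` constant — which, despite its name, holds 2.0). A scalar, single-channel sketch under the same assumptions as before:

```rust
// Scalar model of the second-order running sum behind fast_gaussian for f32.
fn fast_gaussian_1d(data: &mut [f32], radius: usize) {
    let r = radius as i64;
    let len = data.len() as i64;
    let weight = 1.0 / (radius as f32 * radius as f32);
    let mut buffer = [0f32; 1024];
    let (mut diffs, mut summs) = (0f32, 0f32);
    for x in -2 * r..len {
        if x >= 0 {
            data[x as usize] = summs * weight;
            let twice_center = 2.0 * buffer[(x & 1023) as usize]; // the v_half step
            let leaving = buffer[((x - r) & 1023) as usize];
            diffs += leaving - twice_center;
        } else if x + r >= 0 {
            diffs -= 2.0 * buffer[(x & 1023) as usize];
        }
        let next = (x + r).clamp(0, len - 1) as usize;
        let pixel = data[next];
        buffer[((x + r) & 1023) as usize] = pixel;
        diffs += pixel;
        summs += diffs;
    }
}

fn main() {
    let mut row = vec![1.0f32; 16];
    fast_gaussian_1d(&mut row, 3);
    println!("{:?}", row);
}
```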
diff --git a/src/lib/sse/fast_gaussian_next.rs b/src/lib/sse/fast_gaussian_next.rs
index bafeba7..230fdbf 100644
--- a/src/lib/sse/fast_gaussian_next.rs
+++ b/src/lib/sse/fast_gaussian_next.rs
@@ -27,8 +27,8 @@
 use crate::reflect_101;
 use crate::reflect_index;
-use crate::sse::store_u8_u32;
 use crate::sse::utils::load_u8_s32_fast;
+use crate::sse::{_mm_mul_by_3_epi32, store_u8_u32};
 use crate::unsafe_slice::UnsafeSlice;
 use crate::{clamp_edge, EdgeMode};
 #[cfg(target_arch = "x86")]
@@ -78,20 +78,18 @@ unsafe fn fast_gaussian_next_vertical_pass_sse_u8_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
     let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
 
     let height_wide = height as i64;
 
-    let threes = unsafe { _mm_set1_epi32(3) };
-
     let radius_64 = radius as i64;
     let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let f_weight = unsafe { _mm_set1_ps(weight) };
+    let f_weight = _mm_set1_ps(weight);
     for x in start..std::cmp::min(width, end) {
-        let mut diffs = unsafe { _mm_setzero_si128() };
-        let mut ders = unsafe { _mm_setzero_si128() };
-        let mut summs = unsafe { _mm_setzero_si128() };
+        let mut diffs = _mm_setzero_si128();
+        let mut ders = _mm_setzero_si128();
+        let mut summs = _mm_setzero_si128();
 
         let start_y = 0 - 3 * radius as i64;
         for y in start_y..height_wide {
@@ -100,76 +98,68 @@ unsafe fn fast_gaussian_next_vertical_pass_sse_u8_impl<
             if y >= 0 {
                 let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
                 const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
-                let prepared_px_s32 = unsafe {
-                    _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(_mm_mul_ps(
-                        _mm_cvtepi32_ps(summs),
-                        f_weight,
-                    )))
-                };
+                let prepared_px_s32 = _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(_mm_mul_ps(
+                    _mm_cvtepi32_ps(summs),
+                    f_weight,
+                )));
 
                 let bytes_offset = current_y + current_px;
 
-                unsafe {
-                    let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
-                    store_u8_u32::<CHANNELS_COUNT>(dst_ptr, prepared_px_s32);
-                }
+                let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
+                store_u8_u32::<CHANNELS_COUNT>(dst_ptr, prepared_px_s32);
 
                 let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
                 let d_arr_index_2 = ((y - radius_64) & 1023) as usize;
                 let d_arr_index = (y & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
+                let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let stored = _mm_loadu_si128(buf_ptr as *const __m128i);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(d_arr_index_1) };
-                let stored_1 = unsafe { _mm_loadu_si128(buf_ptr_1 as *const __m128i) };
+                let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1);
+                let stored_1 = _mm_loadu_si128(buf_ptr_1 as *const __m128i);
 
-                let buf_ptr_2 = unsafe { buffer.as_mut_ptr().add(d_arr_index_2) };
-                let stored_2 = unsafe { _mm_loadu_si128(buf_ptr_2 as *const __m128i) };
+                let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2);
+                let stored_2 = _mm_loadu_si128(buf_ptr_2 as *const __m128i);
 
-                let new_diff = unsafe {
-                    _mm_sub_epi32(
-                        _mm_mullo_epi32(_mm_sub_epi32(stored, stored_1), threes),
-                        stored_2,
-                    )
-                };
-                diffs = unsafe { _mm_add_epi32(diffs, new_diff) };
+                let new_diff = _mm_sub_epi32(
+                    _mm_mul_by_3_epi32(_mm_sub_epi32(stored, stored_1)),
+                    stored_2,
+                );
+                diffs = _mm_add_epi32(diffs, new_diff);
             } else if y + radius_64 >= 0 {
                 let arr_index = (y & 1023) as usize;
                 let arr_index_1 = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let stored = _mm_loadu_si128(buf_ptr as *const __m128i);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { _mm_loadu_si128(buf_ptr_1 as *const __m128i) };
+                let buf_ptr_1 = buffer.get_unchecked_mut(arr_index_1).as_mut_ptr();
+                let stored_1 = _mm_loadu_si128(buf_ptr_1 as *const __m128i);
 
-                let new_diff = unsafe { _mm_mullo_epi32(_mm_sub_epi32(stored, stored_1), threes) };
+                let new_diff = _mm_mul_by_3_epi32(_mm_sub_epi32(stored, stored_1));
 
-                diffs = unsafe { _mm_add_epi32(diffs, new_diff) };
+                diffs = _mm_add_epi32(diffs, new_diff);
             } else if y + 2 * radius_64 >= 0 {
                 let arr_index = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
-                diffs = unsafe { _mm_sub_epi32(diffs, _mm_mullo_epi32(stored, threes)) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let stored = _mm_loadu_si128(buf_ptr as *const __m128i);
+                diffs = _mm_sub_epi32(diffs, _mm_mul_by_3_epi32(stored));
             }
 
             let next_row_y = clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
                 * (stride as usize);
             let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8 };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8;
 
-            let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
+            let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((y + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_epi32(diffs, pixel_color) };
-            ders = unsafe { _mm_add_epi32(ders, diffs) };
-            summs = unsafe { _mm_add_epi32(summs, ders) };
-            unsafe {
-                _mm_storeu_si128(buf_ptr as *mut __m128i, pixel_color);
-            }
+            diffs = _mm_add_epi32(diffs, pixel_color);
+            ders = _mm_add_epi32(ders, diffs);
+            summs = _mm_add_epi32(summs, ders);
+            _mm_storeu_si128(buf_ptr as *mut __m128i, pixel_color);
         }
     }
 }
@@ -216,20 +206,18 @@ unsafe fn fast_gaussian_next_horizontal_pass_sse_u8_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
     let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
 
     let width_wide = width as i64;
 
-    let threes = unsafe { _mm_set1_epi32(3) };
-
     let radius_64 = radius as i64;
     let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let f_weight = unsafe { _mm_set1_ps(weight) };
+    let f_weight = _mm_set1_ps(weight);
     for y in start..std::cmp::min(height, end) {
-        let mut diffs = unsafe { _mm_setzero_si128() };
-        let mut ders = unsafe { _mm_setzero_si128() };
-        let mut summs = unsafe { _mm_setzero_si128() };
+        let mut diffs = _mm_setzero_si128();
+        let mut ders = _mm_setzero_si128();
+        let mut summs = _mm_setzero_si128();
 
         let current_y = ((y as i64) * (stride as i64)) as usize;
 
@@ -238,76 +226,68 @@
                 let current_px = x as usize * CHANNELS_COUNT;
                 const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
-                let prepared_px_s32 = unsafe {
-                    _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(_mm_mul_ps(
-                        _mm_cvtepi32_ps(summs),
-                        f_weight,
-                    )))
-                };
+                let prepared_px_s32 = _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(_mm_mul_ps(
+                    _mm_cvtepi32_ps(summs),
+                    f_weight,
+                )));
 
                 let bytes_offset = current_y + current_px;
 
-                unsafe {
-                    let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
-                    store_u8_u32::<CHANNELS_COUNT>(dst_ptr, prepared_px_s32);
-                }
+                let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
+                store_u8_u32::<CHANNELS_COUNT>(dst_ptr, prepared_px_s32);
 
                 let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
                 let d_arr_index_2 = ((x - radius_64) & 1023) as usize;
                 let d_arr_index = (x & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
+                let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let stored = _mm_loadu_si128(buf_ptr as *const __m128i);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { _mm_loadu_si128(buf_ptr_1 as *const __m128i) };
+                let buf_ptr_1 = buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr();
+                let stored_1 = _mm_loadu_si128(buf_ptr_1 as *const __m128i);
 
-                let buf_ptr_2 = unsafe { buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr() };
-                let stored_2 = unsafe { _mm_loadu_si128(buf_ptr_2 as *const __m128i) };
+                let buf_ptr_2 = buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr();
+                let stored_2 = _mm_loadu_si128(buf_ptr_2 as *const __m128i);
 
-                let new_diff = unsafe {
-                    _mm_sub_epi32(
-                        _mm_mullo_epi32(_mm_sub_epi32(stored, stored_1), threes),
-                        stored_2,
-                    )
-                };
-                diffs = unsafe { _mm_add_epi32(diffs, new_diff) };
+                let new_diff = _mm_sub_epi32(
+                    _mm_mul_by_3_epi32(_mm_sub_epi32(stored, stored_1)),
+                    stored_2,
+                );
+                diffs = _mm_add_epi32(diffs, new_diff);
             } else if x + radius_64 >= 0 {
                 let arr_index = (x & 1023) as usize;
                 let arr_index_1 = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut i32 };
-                let stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut i32;
+                let stored = _mm_loadu_si128(buf_ptr as *const __m128i);
 
-                let buf_ptr_1 = buffer[arr_index_1].as_mut_ptr();
-                let stored_1 = unsafe { _mm_loadu_si128(buf_ptr_1 as *const __m128i) };
+                let buf_ptr_1 = buffer.as_mut_ptr().add(arr_index_1);
+                let stored_1 = _mm_loadu_si128(buf_ptr_1 as *const __m128i);
 
-                let new_diff = unsafe { _mm_mullo_epi32(_mm_sub_epi32(stored, stored_1), threes) };
+                let new_diff = _mm_mul_by_3_epi32(_mm_sub_epi32(stored, stored_1));
 
-                diffs = unsafe { _mm_add_epi32(diffs, new_diff) };
+                diffs = _mm_add_epi32(diffs, new_diff);
             } else if x + 2 * radius_64 >= 0 {
                 let arr_index = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) };
-                let stored = unsafe { _mm_loadu_si128(buf_ptr as *const __m128i) };
-                diffs = unsafe { _mm_sub_epi32(diffs, _mm_mullo_epi32(stored, threes)) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index);
+                let stored = _mm_loadu_si128(buf_ptr as *const __m128i);
+                diffs = _mm_sub_epi32(diffs, _mm_mul_by_3_epi32(stored));
             }
 
             let next_row_y = (y as usize) * (stride as usize);
             let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1);
             let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8 };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8;
 
-            let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
+            let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((x + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_epi32(diffs, pixel_color) };
-            ders = unsafe { _mm_add_epi32(ders, diffs) };
-            summs = unsafe { _mm_add_epi32(summs, ders) };
-            unsafe {
-                _mm_storeu_si128(buf_ptr as *mut __m128i, pixel_color);
-            }
+            diffs = _mm_add_epi32(diffs, pixel_color);
+            ders = _mm_add_epi32(ders, diffs);
+            summs = _mm_add_epi32(summs, ders);
+            _mm_storeu_si128(buf_ptr as *mut __m128i, pixel_color);
         }
     }
 }
diff --git a/src/lib/sse/fast_gaussian_next_f16.rs b/src/lib/sse/fast_gaussian_next_f16.rs
index d8cb9da..b7fc671 100644
--- a/src/lib/sse/fast_gaussian_next_f16.rs
+++ b/src/lib/sse/fast_gaussian_next_f16.rs
@@ -80,20 +80,20 @@ unsafe fn fast_gaussian_next_vertical_pass_sse_f16_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f16> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, f16> = std::mem::transmute(undefined_slice);
     let mut buffer: [[f32; 4]; 1024] = [[0.; 4]; 1024];
 
     let height_wide = height as i64;
 
-    let threes = unsafe { _mm_set1_ps(3.) };
+    let threes = _mm_set1_ps(3.);
 
     let radius_64 = radius as i64;
     let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let v_weight = unsafe { _mm_set1_ps(weight) };
+    let v_weight = _mm_set1_ps(weight);
     for x in start..std::cmp::min(width, end) {
-        let mut diffs = unsafe { _mm_setzero_ps() };
-        let mut ders = unsafe { _mm_setzero_ps() };
-        let mut summs = unsafe { _mm_setzero_ps() };
+        let mut diffs = _mm_setzero_ps();
+        let mut ders = _mm_setzero_ps();
+        let mut summs = _mm_setzero_ps();
 
         let start_y = 0 - 3 * radius as i64;
         for y in start_y..height_wide {
@@ -104,65 +104,60 @@ unsafe fn fast_gaussian_next_vertical_pass_sse_f16_impl<
 
                 let bytes_offset = current_y + current_px;
 
-                let pixel = unsafe { _mm_mul_ps(summs, v_weight) };
-                unsafe {
-                    let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f16;
-                    store_f32_f16::<CHANNELS_COUNT>(dst_ptr, pixel);
-                }
+                let pixel = _mm_mul_ps(summs, v_weight);
+                let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f16;
+                store_f32_f16::<CHANNELS_COUNT>(dst_ptr, pixel);
 
                 let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
                 let d_arr_index_2 = ((y - radius_64) & 1023) as usize;
                 let d_arr_index = (y & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let stored = _mm_loadu_ps(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32 };
-                let stored_1 = unsafe { _mm_loadu_ps(buf_ptr_1) };
+                let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32;
+                let stored_1 = _mm_loadu_ps(buf_ptr_1);
 
-                let buf_ptr_2 = unsafe { buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32 };
-                let stored_2 = unsafe { _mm_loadu_ps(buf_ptr_2) };
+                let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32;
+                let stored_2 = _mm_loadu_ps(buf_ptr_2);
 
-                let new_diff = unsafe {
-                    _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(stored, stored_1), threes), stored_2)
-                };
-                diffs = unsafe { _mm_add_ps(diffs, new_diff) };
+                let new_diff =
+                    _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(stored, stored_1), threes), stored_2);
+                diffs = _mm_add_ps(diffs, new_diff);
             } else if y + radius_64 >= 0 {
                 let arr_index = (y & 1023) as usize;
                 let arr_index_1 = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let stored = _mm_loadu_ps(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { _mm_loadu_ps(buf_ptr_1) };
+                let buf_ptr_1 = buffer.get_unchecked_mut(arr_index_1).as_mut_ptr();
+                let stored_1 = _mm_loadu_ps(buf_ptr_1);
 
-                let new_diff = unsafe { _mm_mul_ps(_mm_sub_ps(stored, stored_1), threes) };
+                let new_diff = _mm_mul_ps(_mm_sub_ps(stored, stored_1), threes);
 
-                diffs = unsafe { _mm_add_ps(diffs, new_diff) };
+                diffs = _mm_add_ps(diffs, new_diff);
             } else if y + 2 * radius_64 >= 0 {
                 let arr_index = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
-                diffs = unsafe { _mm_sub_ps(diffs, _mm_mul_ps(stored, threes)) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let stored = _mm_loadu_ps(buf_ptr);
+                diffs = _mm_sub_ps(diffs, _mm_mul_ps(stored, threes));
             }
 
             let next_row_y = clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
                 * (stride as usize);
             let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f16 };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f16;
 
-            let pixel_color = unsafe { load_f32_f16::<CHANNELS_COUNT>(s_ptr) };
+            let pixel_color = load_f32_f16::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((y + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_ps(diffs, pixel_color) };
-            ders = unsafe { _mm_add_ps(ders, diffs) };
-            summs = unsafe { _mm_add_ps(summs, ders) };
-            unsafe {
-                _mm_storeu_ps(buf_ptr, pixel_color);
-            }
+            diffs = _mm_add_ps(diffs, pixel_color);
+            ders = _mm_add_ps(ders, diffs);
+            summs = _mm_add_ps(summs, ders);
+            _mm_storeu_ps(buf_ptr, pixel_color);
         }
     }
 }
@@ -209,20 +204,20 @@ unsafe fn fast_gaussian_next_horizontal_pass_sse_f16_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f16> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, f16> = std::mem::transmute(undefined_slice);
     let mut buffer: [[f32; 4]; 1024] = [[0.; 4]; 1024];
 
     let width_wide = width as i64;
 
-    let threes = unsafe { _mm_set1_ps(3.) };
+    let threes = _mm_set1_ps(3.);
 
     let radius_64 = radius as i64;
     let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let v_weight = unsafe { _mm_set1_ps(weight) };
+    let v_weight = _mm_set1_ps(weight);
     for y in start..std::cmp::min(height, end) {
-        let mut diffs = unsafe { _mm_setzero_ps() };
-        let mut ders = unsafe { _mm_setzero_ps() };
-        let mut summs = unsafe { _mm_setzero_ps() };
+        let mut diffs = _mm_setzero_ps();
+        let mut ders = _mm_setzero_ps();
+        let mut summs = _mm_setzero_ps();
 
         let current_y = ((y as i64) * (stride as i64)) as usize;
 
@@ -232,65 +227,60 @@
 
                 let bytes_offset = current_y + current_px;
 
-                let pixel = unsafe { _mm_mul_ps(summs, v_weight) };
-                unsafe {
-                    let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f16;
-                    store_f32_f16::<CHANNELS_COUNT>(dst_ptr, pixel);
-                }
+                let pixel = _mm_mul_ps(summs, v_weight);
+                let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f16;
+                store_f32_f16::<CHANNELS_COUNT>(dst_ptr, pixel);
 
                 let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
                 let d_arr_index_2 = ((x - radius_64) & 1023) as usize;
                 let d_arr_index = (x & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let stored = _mm_loadu_ps(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { _mm_loadu_ps(buf_ptr_1) };
+                let buf_ptr_1 = buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr();
+                let stored_1 = _mm_loadu_ps(buf_ptr_1);
 
-                let buf_ptr_2 = unsafe { buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr() };
-                let stored_2 = unsafe { _mm_loadu_ps(buf_ptr_2) };
+                let buf_ptr_2 = buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr();
+                let stored_2 = _mm_loadu_ps(buf_ptr_2);
 
-                let new_diff = unsafe {
-                    _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(stored, stored_1), threes), stored_2)
-                };
-                diffs = unsafe { _mm_add_ps(diffs, new_diff) };
+                let new_diff =
+                    _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(stored, stored_1), threes), stored_2);
+                diffs = _mm_add_ps(diffs, new_diff);
             } else if x + radius_64 >= 0 {
                 let arr_index = (x & 1023) as usize;
                 let arr_index_1 = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                let stored = _mm_loadu_ps(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(arr_index_1) };
-                let stored_1 = unsafe { _mm_loadu_ps(buf_ptr_1 as *const f32) };
+                let buf_ptr_1 = buffer.as_mut_ptr().add(arr_index_1);
+                let stored_1 = _mm_loadu_ps(buf_ptr_1 as *const f32);
 
-                let new_diff = unsafe { _mm_mul_ps(_mm_sub_ps(stored, stored_1), threes) };
+                let new_diff = _mm_mul_ps(_mm_sub_ps(stored, stored_1), threes);
 
-                diffs = unsafe { _mm_add_ps(diffs, new_diff) };
+                diffs = _mm_add_ps(diffs, new_diff);
             } else if x + 2 * radius_64 >= 0 {
                 let arr_index = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr as *const f32) };
-                diffs = unsafe { _mm_sub_ps(diffs, _mm_mul_ps(stored, threes)) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index);
+                let stored = _mm_loadu_ps(buf_ptr as *const f32);
+                diffs = _mm_sub_ps(diffs, _mm_mul_ps(stored, threes));
             }
 
             let next_row_y = (y as usize) * (stride as usize);
             let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1);
             let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f16 };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f16;
 
-            let pixel_color = unsafe { load_f32_f16::<CHANNELS_COUNT>(s_ptr) };
+            let pixel_color = load_f32_f16::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((x + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_ps(diffs, pixel_color) };
-            ders = unsafe { _mm_add_ps(ders, diffs) };
-            summs = unsafe { _mm_add_ps(summs, ders) };
-            unsafe {
-                _mm_storeu_ps(buf_ptr, pixel_color);
-            }
+            diffs = _mm_add_ps(diffs, pixel_color);
+            ders = _mm_add_ps(ders, diffs);
+            summs = _mm_add_ps(summs, ders);
+            _mm_storeu_ps(buf_ptr, pixel_color);
         }
     }
 }
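All of these passes index past the image edge through `clamp_edge!`, parameterized by `EdgeMode` — plain clamping or `reflect_101`-style mirroring (reflect about the border without repeating the edge pixel). Standalone sketches of the two rules; these are hypothetical free functions, and the crate's macro handles additional modes plus the integer-width plumbing:

```rust
// Clamp: ..0 0 | 0 1 2 .. 9 | 9 9..
fn clamp_index(i: i64, min: i64, max: i64) -> usize {
    i.clamp(min, max) as usize
}

// Reflect-101: ..2 1 | 0 1 2 .. 9 | 8 7.. (border pixel not duplicated)
fn reflect_101(i: i64, len: i64) -> usize {
    let period = 2 * (len - 1);
    let mut i = i.rem_euclid(period);
    if i >= len {
        i = period - i;
    }
    i as usize
}

fn main() {
    assert_eq!(clamp_index(-2, 0, 9), 0);
    assert_eq!(reflect_101(-2, 10), 2);
    assert_eq!(reflect_101(11, 10), 7);
}
```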
diff --git a/src/lib/sse/fast_gaussian_next_f32.rs b/src/lib/sse/fast_gaussian_next_f32.rs
index efa6062..e433b17 100644
--- a/src/lib/sse/fast_gaussian_next_f32.rs
+++ b/src/lib/sse/fast_gaussian_next_f32.rs
@@ -78,20 +78,20 @@ unsafe fn fast_gaussian_next_vertical_pass_sse_f32_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, f32> = std::mem::transmute(undefined_slice);
     let mut buffer: [[f32; 4]; 1024] = [[0.; 4]; 1024];
 
     let height_wide = height as i64;
 
-    let threes = unsafe { _mm_set1_ps(3.) };
+    let threes = _mm_set1_ps(3.);
 
     let radius_64 = radius as i64;
     let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let v_weight = unsafe { _mm_set1_ps(weight) };
+    let v_weight = _mm_set1_ps(weight);
     for x in start..std::cmp::min(width, end) {
-        let mut diffs = unsafe { _mm_setzero_ps() };
-        let mut ders = unsafe { _mm_setzero_ps() };
-        let mut summs = unsafe { _mm_setzero_ps() };
+        let mut diffs = _mm_setzero_ps();
+        let mut ders = _mm_setzero_ps();
+        let mut summs = _mm_setzero_ps();
 
         let start_y = 0 - 3 * radius as i64;
         for y in start_y..height_wide {
@@ -102,65 +102,60 @@ unsafe fn fast_gaussian_next_vertical_pass_sse_f32_impl<
 
                 let bytes_offset = current_y + current_px;
 
-                let pixel = unsafe { _mm_mul_ps(summs, v_weight) };
-                unsafe {
-                    let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f32;
-                    store_f32::<CHANNELS_COUNT>(dst_ptr, pixel);
-                }
+                let pixel = _mm_mul_ps(summs, v_weight);
+                let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f32;
+                store_f32::<CHANNELS_COUNT>(dst_ptr, pixel);
 
                 let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
                 let d_arr_index_2 = ((y - radius_64) & 1023) as usize;
                 let d_arr_index = (y & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let stored = _mm_loadu_ps(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32 };
-                let stored_1 = unsafe { _mm_loadu_ps(buf_ptr_1) };
+                let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1) as *mut f32;
+                let stored_1 = _mm_loadu_ps(buf_ptr_1);
 
-                let buf_ptr_2 = unsafe { buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32 };
-                let stored_2 = unsafe { _mm_loadu_ps(buf_ptr_2) };
+                let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2) as *mut f32;
+                let stored_2 = _mm_loadu_ps(buf_ptr_2);
 
-                let new_diff = unsafe {
-                    _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(stored, stored_1), threes), stored_2)
-                };
-                diffs = unsafe { _mm_add_ps(diffs, new_diff) };
+                let new_diff =
+                    _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(stored, stored_1), threes), stored_2);
+                diffs = _mm_add_ps(diffs, new_diff);
             } else if y + radius_64 >= 0 {
                 let arr_index = (y & 1023) as usize;
                 let arr_index_1 = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let stored = _mm_loadu_ps(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { _mm_loadu_ps(buf_ptr_1) };
+                let buf_ptr_1 = buffer.get_unchecked_mut(arr_index_1).as_mut_ptr();
+                let stored_1 = _mm_loadu_ps(buf_ptr_1);
 
-                let new_diff = unsafe { _mm_mul_ps(_mm_sub_ps(stored, stored_1), threes) };
+                let new_diff = _mm_mul_ps(_mm_sub_ps(stored, stored_1), threes);
 
-                diffs = unsafe { _mm_add_ps(diffs, new_diff) };
+                diffs = _mm_add_ps(diffs, new_diff);
             } else if y + 2 * radius_64 >= 0 {
                 let arr_index = ((y + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
-                diffs = unsafe { _mm_sub_ps(diffs, _mm_mul_ps(stored, threes)) };
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let stored = _mm_loadu_ps(buf_ptr);
+                diffs = _mm_sub_ps(diffs, _mm_mul_ps(stored, threes));
             }
 
             let next_row_y = clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
                 * (stride as usize);
             let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f32 };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut f32;
 
-            let pixel_color = unsafe { load_f32::<CHANNELS_COUNT>(s_ptr) };
+            let pixel_color = load_f32::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((y + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_ps(diffs, pixel_color) };
-            ders = unsafe { _mm_add_ps(ders, diffs) };
-            summs = unsafe { _mm_add_ps(summs, ders) };
-            unsafe {
-                _mm_storeu_ps(buf_ptr, pixel_color);
-            }
+            diffs = _mm_add_ps(diffs, pixel_color);
+            ders = _mm_add_ps(ders, diffs);
+            summs = _mm_add_ps(summs, ders);
+            _mm_storeu_ps(buf_ptr, pixel_color);
         }
     }
 }
@@ -207,20 +202,20 @@ unsafe fn fast_gaussian_next_horizontal_pass_sse_f32_impl<
     end: u32,
 ) {
     let edge_mode: EdgeMode = EDGE_MODE.into();
-    let bytes: &UnsafeSlice<'_, f32> = unsafe { std::mem::transmute(undefined_slice) };
+    let bytes: &UnsafeSlice<'_, f32> = std::mem::transmute(undefined_slice);
     let mut buffer: [[f32; 4]; 1024] = [[0.; 4]; 1024];
 
     let width_wide = width as i64;
 
-    let threes = unsafe { _mm_set1_ps(3.) };
+    let threes = _mm_set1_ps(3.);
 
     let radius_64 = radius as i64;
     let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
-    let v_weight = unsafe { _mm_set1_ps(weight) };
+    let v_weight = _mm_set1_ps(weight);
     for y in start..std::cmp::min(height, end) {
-        let mut diffs = unsafe { _mm_setzero_ps() };
-        let mut ders = unsafe { _mm_setzero_ps() };
-        let mut summs = unsafe { _mm_setzero_ps() };
+        let mut diffs = _mm_setzero_ps();
+        let mut ders = _mm_setzero_ps();
+        let mut summs = _mm_setzero_ps();
 
         let current_y = ((y as i64) * (stride as i64)) as usize;
 
@@ -230,65 +225,60 @@
 
                 let bytes_offset = current_y + current_px;
 
-                let pixel = unsafe { _mm_mul_ps(summs, v_weight) };
-                unsafe {
-                    let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f32;
-                    store_f32::<CHANNELS_COUNT>(dst_ptr, pixel);
-                }
+                let pixel = _mm_mul_ps(summs, v_weight);
+                let dst_ptr = bytes.slice.as_ptr().add(bytes_offset) as *mut f32;
+                store_f32::<CHANNELS_COUNT>(dst_ptr, pixel);
 
                 let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
                 let d_arr_index_2 = ((x - radius_64) & 1023) as usize;
                 let d_arr_index = (x & 1023) as usize;
 
-                let buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let stored = _mm_loadu_ps(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr() };
-                let stored_1 = unsafe { _mm_loadu_ps(buf_ptr_1) };
+                let buf_ptr_1 = buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr();
+                let stored_1 = _mm_loadu_ps(buf_ptr_1);
 
-                let buf_ptr_2 = unsafe { buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr() };
-                let stored_2 = unsafe { _mm_loadu_ps(buf_ptr_2) };
+                let buf_ptr_2 = buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr();
+                let stored_2 = _mm_loadu_ps(buf_ptr_2);
 
-                let new_diff = unsafe {
-                    _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(stored, stored_1), threes), stored_2)
-                };
-                diffs = unsafe { _mm_add_ps(diffs, new_diff) };
+                let new_diff =
+                    _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(stored, stored_1), threes), stored_2);
+                diffs = _mm_add_ps(diffs, new_diff);
             } else if x + radius_64 >= 0 {
                 let arr_index = (x & 1023) as usize;
                 let arr_index_1 = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) as *mut f32 };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut f32;
+                let stored = _mm_loadu_ps(buf_ptr);
 
-                let buf_ptr_1 = unsafe { buffer.as_mut_ptr().add(arr_index_1) };
-                let stored_1 = unsafe { _mm_loadu_ps(buf_ptr_1 as *const f32) };
+                let buf_ptr_1 = buffer.as_mut_ptr().add(arr_index_1);
+                let stored_1 = _mm_loadu_ps(buf_ptr_1 as *const f32);
 
-                let new_diff = unsafe { _mm_mul_ps(_mm_sub_ps(stored, stored_1), threes) };
+                let new_diff = _mm_mul_ps(_mm_sub_ps(stored, stored_1), threes);
 
-                diffs = unsafe { _mm_add_ps(diffs, new_diff) };
+                diffs = _mm_add_ps(diffs, new_diff);
             } else if x + 2 * radius_64 >= 0 {
                 let arr_index = ((x + radius_64) & 1023) as usize;
-                let buf_ptr = unsafe { buffer.as_mut_ptr().add(arr_index) };
-                let stored = unsafe { _mm_loadu_ps(buf_ptr as *const f32) };
-                diffs = unsafe { _mm_sub_ps(diffs, _mm_mul_ps(stored, threes)) };
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index);
+                let stored = _mm_loadu_ps(buf_ptr as *const f32);
+                diffs = _mm_sub_ps(diffs, _mm_mul_ps(stored, threes));
             }
 
             let next_row_y = (y as usize) * (stride as usize);
             let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1);
             let next_row_px = next_row_x * CHANNELS_COUNT;
 
-            let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32 };
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32;
 
-            let pixel_color = unsafe { load_f32::<CHANNELS_COUNT>(s_ptr) };
+            let pixel_color = load_f32::<CHANNELS_COUNT>(s_ptr);
 
             let arr_index = ((x + 2 * radius_64) & 1023) as usize;
-            let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
 
-            diffs = unsafe { _mm_add_ps(diffs, pixel_color) };
-            ders = unsafe { _mm_add_ps(ders, diffs) };
-            summs = unsafe { _mm_add_ps(summs, ders) };
-            unsafe {
-                _mm_storeu_ps(buf_ptr, pixel_color);
-            }
+            diffs = _mm_add_ps(diffs, pixel_color);
+            ders = _mm_add_ps(ders, diffs);
+            summs = _mm_add_ps(summs, ders);
+            _mm_storeu_ps(buf_ptr, pixel_color);
         }
     }
 }
radius_64 / 2, 0, width_wide - 1); let next_row_px = next_row_x * CHANNELS_COUNT; - let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32 }; + let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut f32; - let pixel_color = unsafe { load_f32::(s_ptr) }; + let pixel_color = load_f32::(s_ptr); let arr_index = ((x + 2 * radius_64) & 1023) as usize; - let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() }; + let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr(); - diffs = unsafe { _mm_add_ps(diffs, pixel_color) }; - ders = unsafe { _mm_add_ps(ders, diffs) }; - summs = unsafe { _mm_add_ps(summs, ders) }; - unsafe { - _mm_storeu_ps(buf_ptr, pixel_color); - } + diffs = _mm_add_ps(diffs, pixel_color); + ders = _mm_add_ps(ders, diffs); + summs = _mm_add_ps(summs, ders); + _mm_storeu_ps(buf_ptr, pixel_color); } } } diff --git a/src/lib/sse/utils.rs b/src/lib/sse/utils.rs index 830debc..5ce950d 100644 --- a/src/lib/sse/utils.rs +++ b/src/lib/sse/utils.rs @@ -429,3 +429,9 @@ pub(crate) unsafe fn store_f32_f16( casted_ptr.write_unaligned(item0); } } + +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn _mm_mul_by_3_epi32(v: __m128i) -> __m128i { + _mm_add_epi32(_mm_slli_epi32::<1>(v), v) +} \ No newline at end of file diff --git a/src/lib/stack_blur.rs b/src/lib/stack_blur.rs index f258d5c..98361d7 100644 --- a/src/lib/stack_blur.rs +++ b/src/lib/stack_blur.rs @@ -25,17 +25,19 @@ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +use crate::cpu_features::{is_x86_avx512dq_supported, is_x86_avx512vl_supported}; use crate::mul_table::{MUL_TABLE_STACK_BLUR, SHR_TABLE_STACK_BLUR}; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::{stack_blur_pass_neon_i32, stack_blur_pass_neon_i64}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{stack_blur_pass_sse, stack_blur_pass_sse_i64}; use crate::unsafe_slice::UnsafeSlice; +#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] +use crate::wasm32::stack_blur_pass_wasm_i32; use crate::{FastBlurChannels, ThreadingPolicy}; use num_traits::{AsPrimitive, FromPrimitive}; use std::ops::AddAssign; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::cpu_features::{is_x86_avx512dq_supported, is_x86_avx512vl_supported}; const BASE_RADIUS_I64_CUTOFF: u32 = 150; @@ -740,13 +742,17 @@ fn stack_blur_worker_horizontal( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if _is_sse_available { - if _is_avx512dq_available && _is_avx512vl_available { - _dispatcher = stack_blur_pass_sse::<3, true>; - } else { - _dispatcher = stack_blur_pass_sse::<3, false>; - } + if _is_avx512dq_available && _is_avx512vl_available { + _dispatcher = stack_blur_pass_sse::<3, true>; + } else { + _dispatcher = stack_blur_pass_sse::<3, false>; + } } } + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + { + _dispatcher = stack_blur_pass_wasm_i32::<3>; + } } else { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { @@ -804,6 +810,10 @@ fn stack_blur_worker_horizontal( } } } + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + { + _dispatcher = stack_blur_pass_wasm_i32::<4>; + } } else { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { @@ -907,6 +917,10 @@ fn stack_blur_worker_vertical( } } } + #[cfg(all(target_arch = 
"wasm32", target_feature = "simd128"))] + { + _dispatcher = stack_blur_pass_wasm_i32::<3>; + } } else { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { @@ -964,6 +978,10 @@ fn stack_blur_worker_vertical( } } } + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + { + _dispatcher = stack_blur_pass_wasm_i32::<3>; + } } else { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { diff --git a/src/lib/wasm32/fast_gaussian.rs b/src/lib/wasm32/fast_gaussian.rs new file mode 100644 index 0000000..f7be825 --- /dev/null +++ b/src/lib/wasm32/fast_gaussian.rs @@ -0,0 +1,249 @@ +// Copyright (c) Radzivon Bartoshyk. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+use crate::unsafe_slice::UnsafeSlice;
+use crate::wasm32::utils::{
+    load_u8_s32_fast, u16x8_pack_trunc_u8x16, u32x4_pack_trunc_u16x8, w_store_u8x8_m4,
+};
+use crate::{clamp_edge, reflect_101, reflect_index, EdgeMode};
+use std::arch::wasm32::*;
+
+pub fn fast_gaussian_horizontal_pass_wasm_u8<
+    T,
+    const CHANNELS_COUNT: usize,
+    const EDGE_MODE: usize,
+>(
+    undefined_slice: &UnsafeSlice<T>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    start: u32,
+    end: u32,
+) {
+    unsafe {
+        fast_gaussian_horizontal_pass_impl::<T, CHANNELS_COUNT, EDGE_MODE>(
+            undefined_slice,
+            stride,
+            width,
+            height,
+            radius,
+            start,
+            end,
+        );
+    }
+}
+
+#[inline]
+#[target_feature(enable = "simd128")]
+unsafe fn fast_gaussian_horizontal_pass_impl<
+    T,
+    const CHANNELS_COUNT: usize,
+    const EDGE_MODE: usize,
+>(
+    undefined_slice: &UnsafeSlice<T>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    start: u32,
+    end: u32,
+) {
+    let edge_mode: EdgeMode = EDGE_MODE.into();
+    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
+    let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
+    let initial_sum = ((radius * radius) >> 1) as i32;
+
+    let radius_64 = radius as i64;
+    let width_wide = width as i64;
+    let v_weight = f32x4_splat((1f64 / (radius as f64 * radius as f64)) as f32);
+    for y in start..std::cmp::min(height, end) {
+        let mut diffs = i32x4_splat(0);
+        let mut summs = i32x4_splat(initial_sum);
+
+        let current_y = ((y as i64) * (stride as i64)) as usize;
+
+        let start_x = 0 - 2 * radius_64;
+        for x in start_x..(width as i64) {
+            if x >= 0 {
+                let current_px = ((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;
+
+                let prepared_px_s32 = i32x4_trunc_sat_f32x4(f32x4_floor(f32x4_mul(
+                    f32x4_convert_i32x4(summs),
+                    v_weight,
+                )));
+                let prepared_u16 = u32x4_pack_trunc_u16x8(prepared_px_s32, prepared_px_s32);
+                let prepared_u8 = u16x8_pack_trunc_u8x16(prepared_u16, prepared_u16);
+
+                let bytes_offset = current_y + current_px;
+                let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
+                w_store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);
+
+                let arr_index = ((x - radius_64) & 1023) as usize;
+                let d_arr_index = (x & 1023) as usize;
+
+                let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let mut d_stored = v128_load(d_buf_ptr as *const v128);
+                d_stored = i32x4_shl(d_stored, 1);
+
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let a_stored = v128_load(buf_ptr as *const v128);
+
+                diffs = i32x4_add(diffs, i32x4_sub(a_stored, d_stored));
+            } else if x + radius_64 >= 0 {
+                let arr_index = (x & 1023) as usize;
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let mut stored = v128_load(buf_ptr as *const v128);
+                stored = i32x4_shl(stored, 1);
+                diffs = i32x4_sub(diffs, stored);
+            }
+
+            let next_row_y = (y as usize) * (stride as usize);
+            let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
+            let next_row_px = next_row_x * CHANNELS_COUNT;
+
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8;
+            let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
+
+            let arr_index = ((x + radius_64) & 1023) as usize;
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+
+            diffs = i32x4_add(diffs, pixel_color);
+            summs = i32x4_add(summs, diffs);
+            v128_store(buf_ptr as *mut v128, pixel_color);
+        }
+    }
+}
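+
+/// Vertical companion of the pass above: the same recurrence walks down a
+/// column, stepping by `stride` per row. The `f32x4 -> i32x4 -> u16x8 -> u8x16`
+/// chain narrows the weighted sums back to bytes; `i32x4_trunc_sat_f32x4`
+/// saturates and the shuffle packs keep the low bytes, which should be safe
+/// here because a weighted average of u8 inputs stays within the u8 range.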
+pub fn fast_gaussian_vertical_pass_wasm_u8<
+    T,
+    const CHANNELS_COUNT: usize,
+    const EDGE_MODE: usize,
+>(
+    undefined_slice: &UnsafeSlice<T>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    start: u32,
+    end: u32,
+) {
+    unsafe {
+        fast_gaussian_vertical_pass_wasm::<T, CHANNELS_COUNT, EDGE_MODE>(
+            undefined_slice,
+            stride,
+            width,
+            height,
+            radius,
+            start,
+            end,
+        );
+    }
+}
+
+#[inline]
+#[target_feature(enable = "simd128")]
+unsafe fn fast_gaussian_vertical_pass_wasm<
+    T,
+    const CHANNELS_COUNT: usize,
+    const EDGE_MODE: usize,
+>(
+    undefined_slice: &UnsafeSlice<T>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    start: u32,
+    end: u32,
+) {
+    let edge_mode: EdgeMode = EDGE_MODE.into();
+    let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
+    let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
+    let initial_sum = ((radius * radius) >> 1) as i32;
+
+    let height_wide = height as i64;
+
+    let radius_64 = radius as i64;
+    let v_weight = f32x4_splat((1f64 / (radius as f64 * radius as f64)) as f32);
+    for x in start..std::cmp::min(width, end) {
+        let mut diffs = i32x4_splat(0);
+        let mut summs = i32x4_splat(initial_sum);
+
+        let start_y = 0 - 2 * radius as i64;
+        for y in start_y..height_wide {
+            let current_y = (y * (stride as i64)) as usize;
+
+            if y >= 0 {
+                let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
+
+                let prepared_px_s32 = i32x4_trunc_sat_f32x4(f32x4_floor(f32x4_mul(
+                    f32x4_convert_i32x4(summs),
+                    v_weight,
+                )));
+                let prepared_u16 = u32x4_pack_trunc_u16x8(prepared_px_s32, prepared_px_s32);
+                let prepared_u8 = u16x8_pack_trunc_u8x16(prepared_u16, prepared_u16);
+
+                let bytes_offset = current_y + current_px;
+
+                let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
+                w_store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);
+
+                let arr_index = ((y - radius_64) & 1023) as usize;
+                let d_arr_index = (y & 1023) as usize;
+
+                let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let mut d_stored = v128_load(d_buf_ptr as *const v128);
+                d_stored = i32x4_shl(d_stored, 1);
+
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let a_stored = v128_load(buf_ptr as *const v128);
+
+                diffs = i32x4_add(diffs, i32x4_sub(a_stored, d_stored));
+            } else if y + radius_64 >= 0 {
+                let arr_index = (y & 1023) as usize;
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let mut stored = v128_load(buf_ptr as *const v128);
+                stored = i32x4_shl(stored, 1);
+                diffs = i32x4_sub(diffs, stored);
+            }
+
+            let next_row_y =
+                clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
+            let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
+
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8;
+            let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
+
+            let arr_index = ((y + radius_64) & 1023) as usize;
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+
+            diffs = i32x4_add(diffs, pixel_color);
+            summs = i32x4_add(summs, diffs);
+            v128_store(buf_ptr as *mut v128, pixel_color);
+        }
+    }
+}
diff --git a/src/lib/wasm32/fast_gaussian_next.rs b/src/lib/wasm32/fast_gaussian_next.rs
new file mode 100644
index 0000000..fb6c2dd
--- /dev/null
+++ b/src/lib/wasm32/fast_gaussian_next.rs
@@ -0,0 +1,291 @@
+// Copyright (c) Radzivon Bartoshyk. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+use crate::reflect_101;
+use crate::reflect_index;
+use crate::unsafe_slice::UnsafeSlice;
+use crate::wasm32::utils::{
+    i32x4_mul_by_3, load_u8_s32_fast, u16x8_pack_trunc_u8x16, u32x4_pack_trunc_u16x8,
+    w_store_u8x8_m4,
+};
+use crate::{clamp_edge, EdgeMode};
+use std::arch::wasm32::*;
+
+pub fn fast_gaussian_next_vertical_pass_wasm_u8<
+    T,
+    const CHANNELS_COUNT: usize,
+    const EDGE_MODE: usize,
+>(
+    undefined_slice: &UnsafeSlice<T>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    start: u32,
+    end: u32,
+) {
+    unsafe {
+        fast_gaussian_next_vertical_pass_impl::<T, CHANNELS_COUNT, EDGE_MODE>(
+            undefined_slice,
+            stride,
+            width,
+            height,
+            radius,
+            start,
+            end,
+        );
+    }
+}
+
+#[inline]
+#[target_feature(enable = "simd128")]
+unsafe fn fast_gaussian_next_vertical_pass_impl<
+    T,
+    const CHANNELS_COUNT: usize,
+    const EDGE_MODE: usize,
+>(
+    undefined_slice: &UnsafeSlice<T>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    start: u32,
+    end: u32,
+) {
+    let edge_mode: EdgeMode = EDGE_MODE.into();
+    let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
+    let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
+
+    let height_wide = height as i64;
+
+    let radius_64 = radius as i64;
+    let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
+    let f_weight = f32x4_splat(weight);
+    for x in start..std::cmp::min(width, end) {
+        let mut diffs = i32x4_splat(0);
+        let mut ders = i32x4_splat(0);
+        let mut summs = i32x4_splat(0);
+
+        let start_y = 0 - 3 * radius as i64;
+        for y in start_y..height_wide {
+            let current_y = (y * (stride as i64)) as usize;
+
+            if y >= 0 {
+                let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
+                let prepared_px_s32 = i32x4_trunc_sat_f32x4(f32x4_floor(f32x4_mul(
+                    f32x4_convert_i32x4(summs),
+                    f_weight,
+                )));
+                let prepared_u16 = u32x4_pack_trunc_u16x8(prepared_px_s32, prepared_px_s32);
+                let prepared_u8 = u16x8_pack_trunc_u8x16(prepared_u16, prepared_u16);
+
+                let bytes_offset = current_y + current_px;
+
+                let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
+                w_store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);
+
+                let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
+                let d_arr_index_2 = ((y - radius_64) & 1023) as usize;
+                let d_arr_index = (y & 1023) as usize;
+
+                let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let stored = v128_load(buf_ptr as *const v128);
+
+                let buf_ptr_1 = buffer.as_mut_ptr().add(d_arr_index_1);
+                let stored_1 = v128_load(buf_ptr_1 as *const v128);
+
+                let buf_ptr_2 = buffer.as_mut_ptr().add(d_arr_index_2);
+                let stored_2 = v128_load(buf_ptr_2 as *const v128);
+
+                let new_diff = i32x4_sub(i32x4_mul_by_3(i32x4_sub(stored, stored_1)), stored_2);
+                diffs = i32x4_add(diffs, new_diff);
+            } else if y + radius_64 >= 0 {
+                let arr_index = (y & 1023) as usize;
+                let arr_index_1 = ((y + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let stored = v128_load(buf_ptr as *const v128);
+
+                let buf_ptr_1 = buffer.get_unchecked_mut(arr_index_1).as_mut_ptr();
+                let stored_1 = v128_load(buf_ptr_1 as *const v128);
+
+                let new_diff = i32x4_mul_by_3(i32x4_sub(stored, stored_1));
+
+                diffs = i32x4_add(diffs, new_diff);
+            } else if y + 2 * radius_64 >= 0 {
+                let arr_index = ((y + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+                let stored = v128_load(buf_ptr as *const v128);
+                diffs = i32x4_sub(diffs, i32x4_mul_by_3(stored));
+            }
+
+            let next_row_y = clamp_edge!(edge_mode, y + ((3 * radius_64) >> 1), 0, height_wide - 1)
+                * (stride as usize);
+            let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
+
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8;
+
+            let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
+
+            let arr_index = ((y + 2 * radius_64) & 1023) as usize;
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+
+            diffs = i32x4_add(diffs, pixel_color);
+            ders = i32x4_add(ders, diffs);
+            summs = i32x4_add(summs, ders);
+            v128_store(buf_ptr as *mut v128, pixel_color);
+        }
+    }
+}
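+
+// The "next" variant integrates three cascaded running sums (diffs -> ders ->
+// summs), which corresponds to a smoother, piecewise-polynomial kernel and the
+// 1 / radius^3 normalization above. A rough scalar sketch of the steady-state
+// difference update (ring-buffer indices taken mod 1024):
+//
+//     diffs += 3 * (buffer[i] - buffer[i + radius]) - buffer[i - radius];
+//     ders  += diffs;
+//     summs += ders;
+//
+// The horizontal pass below applies the same scheme along a row.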
+pub fn fast_gaussian_next_horizontal_pass_wasm_u8<
+    T,
+    const CHANNELS_COUNT: usize,
+    const EDGE_MODE: usize,
+>(
+    undefined_slice: &UnsafeSlice<T>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    start: u32,
+    end: u32,
+) {
+    unsafe {
+        fast_gaussian_next_horizontal_pass_impl::<T, CHANNELS_COUNT, EDGE_MODE>(
+            undefined_slice,
+            stride,
+            width,
+            height,
+            radius,
+            start,
+            end,
+        );
+    }
+}
+
+#[inline]
+#[target_feature(enable = "simd128")]
+unsafe fn fast_gaussian_next_horizontal_pass_impl<
+    T,
+    const CHANNELS_COUNT: usize,
+    const EDGE_MODE: usize,
+>(
+    undefined_slice: &UnsafeSlice<T>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    start: u32,
+    end: u32,
+) {
+    let edge_mode: EdgeMode = EDGE_MODE.into();
+    let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
+    let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
+
+    let width_wide = width as i64;
+
+    let radius_64 = radius as i64;
+    let weight = 1.0f32 / ((radius as f32) * (radius as f32) * (radius as f32));
+    let f_weight = f32x4_splat(weight);
+    for y in start..std::cmp::min(height, end) {
+        let mut diffs = i32x4_splat(0);
+        let mut ders = i32x4_splat(0);
+        let mut summs = i32x4_splat(0);
+
+        let current_y = ((y as i64) * (stride as i64)) as usize;
+
+        for x in (0 - 3 * radius_64)..(width as i64) {
+            if x >= 0 {
+                let current_px = x as usize * CHANNELS_COUNT;
+
+                let prepared_px_s32 = i32x4_trunc_sat_f32x4(f32x4_floor(f32x4_mul(
+                    f32x4_convert_i32x4(summs),
+                    f_weight,
+                )));
+                let prepared_u16 = u32x4_pack_trunc_u16x8(prepared_px_s32, prepared_px_s32);
+                let prepared_u8 = u16x8_pack_trunc_u8x16(prepared_u16, prepared_u16);
+
+                let bytes_offset = current_y + current_px;
+
+                let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
+                w_store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);
+
+                let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
+                let d_arr_index_2 = ((x - radius_64) & 1023) as usize;
+                let d_arr_index = (x & 1023) as usize;
+
+                let buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
+                let stored = v128_load(buf_ptr as *const v128);
+
+                let buf_ptr_1 = buffer.get_unchecked_mut(d_arr_index_1).as_mut_ptr();
+                let stored_1 = v128_load(buf_ptr_1 as *const v128);
+
+                let buf_ptr_2 = buffer.get_unchecked_mut(d_arr_index_2).as_mut_ptr();
+                let stored_2 = v128_load(buf_ptr_2 as *const v128);
+
+                let ck = i32x4_sub(stored, stored_1);
+                let new_diff = i32x4_sub(i32x4_mul_by_3(ck), stored_2);
+                diffs = i32x4_add(diffs, new_diff);
+            } else if x + radius_64 >= 0 {
+                let arr_index = (x & 1023) as usize;
+                let arr_index_1 = ((x + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index) as *mut i32;
+                let stored = v128_load(buf_ptr as *const v128);
+
+                let buf_ptr_1 = buffer.as_mut_ptr().add(arr_index_1);
+                let stored_1 = v128_load(buf_ptr_1 as *const v128);
+
+                let new_diff = i32x4_mul_by_3(i32x4_sub(stored, stored_1));
+
+                diffs = i32x4_add(diffs, new_diff);
+            } else if x + 2 * radius_64 >= 0 {
+                let arr_index = ((x + radius_64) & 1023) as usize;
+                let buf_ptr = buffer.as_mut_ptr().add(arr_index);
+                let stored = v128_load(buf_ptr as *const v128);
+                diffs = i32x4_sub(diffs, i32x4_mul_by_3(stored));
+            }
+
+            let next_row_y = (y as usize) * (stride as usize);
+            let next_row_x = clamp_edge!(edge_mode, x + 3 * radius_64 / 2, 0, width_wide - 1);
+            let next_row_px = next_row_x * CHANNELS_COUNT;
+
+            let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8;
+
+            let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);
+
+            let arr_index = ((x + 2 * radius_64) & 1023) as usize;
+            let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
+
+            diffs = i32x4_add(diffs, pixel_color);
+            ders = i32x4_add(ders, diffs);
+            summs = i32x4_add(summs, ders);
+            v128_store(buf_ptr as *mut v128, pixel_color);
+        }
+    }
+}
diff --git a/src/lib/wasm32/mod.rs b/src/lib/wasm32/mod.rs
new file mode 100644
index 0000000..e38e16d
--- /dev/null
+++ b/src/lib/wasm32/mod.rs
@@ -0,0 +1,40 @@
+/*
+ * // Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ * //
+ * // Redistribution and use in source and binary forms, with or without modification,
+ * // are permitted provided that the following conditions are met:
+ * //
+ * // 1. Redistributions of source code must retain the above copyright notice, this
+ * // list of conditions and the following disclaimer.
+ * //
+ * // 2. Redistributions in binary form must reproduce the above copyright notice,
+ * // this list of conditions and the following disclaimer in the documentation
+ * // and/or other materials provided with the distribution.
+ * //
+ * // 3. Neither the name of the copyright holder nor the names of its
+ * // contributors may be used to endorse or promote products derived from
+ * // this software without specific prior written permission.
+ * //
+ * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+mod fast_gaussian;
+mod fast_gaussian_next;
+mod stack_blur_i32;
+mod utils;
+
+pub use fast_gaussian::{
+    fast_gaussian_horizontal_pass_wasm_u8, fast_gaussian_vertical_pass_wasm_u8,
+};
+pub use fast_gaussian_next::{
+    fast_gaussian_next_horizontal_pass_wasm_u8, fast_gaussian_next_vertical_pass_wasm_u8,
+};
+pub use stack_blur_i32::stack_blur_pass_wasm_i32;
diff --git a/src/lib/wasm32/stack_blur_i32.rs b/src/lib/wasm32/stack_blur_i32.rs
new file mode 100644
index 0000000..c40706a
--- /dev/null
+++ b/src/lib/wasm32/stack_blur_i32.rs
@@ -0,0 +1,280 @@
+// Copyright (c) Radzivon Bartoshyk. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
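+
+// Stack blur keeps a circular "stack" of the last 2 * radius + 1 pixels per
+// lane plus two partial sums (`sum_in`, `sum_out`), so each output pixel is
+// produced in O(1). The division by the stack weight is replaced with a
+// multiply/shift pair from the lookup tables, roughly this scalar sketch:
+//
+//     dst = ((sums as i64 * MUL_TABLE_STACK_BLUR[radius] as i64)
+//         >> SHR_TABLE_STACK_BLUR[radius]) as u8;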
+
+use crate::mul_table::{MUL_TABLE_STACK_BLUR, SHR_TABLE_STACK_BLUR};
+use crate::stack_blur::StackBlurPass;
+use crate::unsafe_slice::UnsafeSlice;
+use crate::wasm32::utils::{
+    i32x4_pack_trunc_i64x2, load_u8_s32_fast, u16x8_pack_trunc_u8x16, u32x4_pack_trunc_u16x8,
+    w_store_u8x8_m4,
+};
+use std::arch::wasm32::*;
+
+pub fn stack_blur_pass_wasm_i32<const COMPONENTS: usize>(
+    pixels: &UnsafeSlice<u8>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    pass: StackBlurPass,
+    thread: usize,
+    total_threads: usize,
+) {
+    unsafe {
+        stack_blur_pass_impl::<COMPONENTS>(
+            pixels,
+            stride,
+            width,
+            height,
+            radius,
+            pass,
+            thread,
+            total_threads,
+        );
+    }
+}
+
+#[inline]
+#[target_feature(enable = "simd128")]
+unsafe fn stack_blur_pass_impl<const COMPONENTS: usize>(
+    pixels: &UnsafeSlice<u8>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    pass: StackBlurPass,
+    thread: usize,
+    total_threads: usize,
+) {
+    let div = ((radius * 2) + 1) as usize;
+    let (mut xp, mut yp);
+    let mut sp;
+    let mut stack_start;
+    let mut stacks = vec![0i32; 4 * div];
+
+    let mut sums: v128;
+    let mut sum_in: v128;
+    let mut sum_out: v128;
+
+    let wm = width - 1;
+    let hm = height - 1;
+    let div = (radius * 2) + 1;
+    let mul_sum = i64x2_splat(MUL_TABLE_STACK_BLUR[radius as usize] as i64);
+    let shr_sum = SHR_TABLE_STACK_BLUR[radius as usize] as u32;
+
+    let mut src_ptr;
+    let mut dst_ptr;
+
+    if pass == StackBlurPass::HORIZONTAL {
+        let min_y = thread * height as usize / total_threads;
+        let max_y = (thread + 1) * height as usize / total_threads;
+
+        for y in min_y..max_y {
+            sums = i32x4_splat(0i32);
+            sum_in = i32x4_splat(0i32);
+            sum_out = i32x4_splat(0i32);
+
+            src_ptr = stride as usize * y; // start of line (0,y)
+
+            let src_ld = pixels.slice.as_ptr().add(src_ptr) as *const i32;
+            let src_pixel = load_u8_s32_fast::<COMPONENTS>(src_ld as *const u8);
+
+            for i in 0..=radius {
+                let stack_value = stacks.as_mut_ptr().add(i as usize * 4);
+                v128_store(stack_value as *mut v128, src_pixel);
+                sums = i32x4_add(sums, i32x4_mul(src_pixel, i32x4_splat(i as i32 + 1)));
+                sum_out = i32x4_add(sum_out, src_pixel);
+            }
+
+            for i in 1..=radius {
+                if i <= wm {
+                    src_ptr += COMPONENTS;
+                }
+                let stack_ptr = stacks.as_mut_ptr().add((i + radius) as usize * 4);
+                let src_ld = pixels.slice.as_ptr().add(src_ptr) as *const i32;
+                let src_pixel = load_u8_s32_fast::<COMPONENTS>(src_ld as *const u8);
+                v128_store(stack_ptr as *mut v128, src_pixel);
+                sums = i32x4_add(
+                    sums,
+                    i32x4_mul(src_pixel, i32x4_splat(radius as i32 + 1 - i as i32)),
+                );
+
+                sum_in = i32x4_add(sum_in, src_pixel);
+            }
+
+            sp = radius;
+            xp = radius;
+            if xp > wm {
+                xp = wm;
+            }
+
+            src_ptr = COMPONENTS * xp as usize + y * stride as usize;
+            dst_ptr = y * stride as usize;
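+            // Each iteration advances the rolling stack in O(1); per channel it
+            // is roughly this scalar sketch (`next` is the clamped source index):
+            //
+            //     sums -= sum_out;
+            //     sum_out -= stacks[(sp + div - radius) % div];
+            //     stacks[(sp + div - radius) % div] = src[next];
+            //     sum_in += src[next];
+            //     sums += sum_in;
+            //     sp = (sp + 1) % div;
+            //     sum_out += stacks[sp];
+            //     sum_in -= stacks[sp];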
+            for _ in 0..width {
+                let store_ld = pixels.slice.as_ptr().add(dst_ptr) as *mut u8;
+                let blurred_lo =
+                    i64x2_shr(i64x2_mul(i64x2_extend_low_i32x4(sums), mul_sum), shr_sum);
+                let blurred_hi =
+                    i64x2_shr(i64x2_mul(i64x2_extend_high_i32x4(sums), mul_sum), shr_sum);
+                let blurred_i32 = i32x4_pack_trunc_i64x2(blurred_lo, blurred_hi);
+                let prepared_u16 = u32x4_pack_trunc_u16x8(blurred_i32, blurred_i32);
+                let blurred = u16x8_pack_trunc_u8x16(prepared_u16, prepared_u16);
+
+                w_store_u8x8_m4::<COMPONENTS>(store_ld, blurred);
+                dst_ptr += COMPONENTS;
+
+                sums = i32x4_sub(sums, sum_out);
+
+                stack_start = sp + div - radius;
+                if stack_start >= div {
+                    stack_start -= div;
+                }
+                let stack = stacks.as_mut_ptr().add(stack_start as usize * 4);
+
+                let stack_val = v128_load(stack as *const v128);
+
+                sum_out = i32x4_sub(sum_out, stack_val);
+
+                if xp < wm {
+                    src_ptr += COMPONENTS;
+                    xp += 1;
+                }
+
+                let src_ld = pixels.slice.as_ptr().add(src_ptr);
+                let src_pixel = load_u8_s32_fast::<COMPONENTS>(src_ld as *const u8);
+                v128_store(stack as *mut v128, src_pixel);
+
+                sum_in = i32x4_add(sum_in, src_pixel);
+                sums = i32x4_add(sums, sum_in);
+
+                sp += 1;
+                if sp >= div {
+                    sp = 0;
+                }
+                let stack = stacks.as_mut_ptr().add(sp as usize * 4);
+                let stack_val = v128_load(stack as *const v128);
+
+                sum_out = i32x4_add(sum_out, stack_val);
+                sum_in = i32x4_sub(sum_in, stack_val);
+            }
+        }
+    } else if pass == StackBlurPass::VERTICAL {
+        let min_x = thread * width as usize / total_threads;
+        let max_x = (thread + 1) * width as usize / total_threads;
+
+        for x in min_x..max_x {
+            sums = i32x4_splat(0i32);
+            sum_in = i32x4_splat(0i32);
+            sum_out = i32x4_splat(0i32);
+
+            src_ptr = COMPONENTS * x; // x,0
+
+            let src_ld = pixels.slice.as_ptr().add(src_ptr) as *const i32;
+
+            let src_pixel = load_u8_s32_fast::<COMPONENTS>(src_ld as *const u8);
+
+            for i in 0..=radius {
+                let stack_ptr = stacks.as_mut_ptr().add(i as usize * 4);
+                v128_store(stack_ptr as *mut v128, src_pixel);
+                sums = i32x4_add(sums, i32x4_mul(src_pixel, i32x4_splat(i as i32 + 1)));
+                sum_out = i32x4_add(sum_out, src_pixel);
+            }
+
+            for i in 1..=radius {
+                if i <= hm {
+                    src_ptr += stride as usize;
+                }
+
+                let stack_ptr = stacks.as_mut_ptr().add((i + radius) as usize * 4);
+                let src_ld = pixels.slice.as_ptr().add(src_ptr) as *const i32;
+                let src_pixel = load_u8_s32_fast::<COMPONENTS>(src_ld as *const u8);
+                v128_store(stack_ptr as *mut v128, src_pixel);
+                sums = i32x4_add(
+                    sums,
+                    i32x4_mul(src_pixel, i32x4_splat(radius as i32 + 1 - i as i32)),
+                );
+
+                sum_in = i32x4_add(sum_in, src_pixel);
+            }
+
+            sp = radius;
+            yp = radius;
+            if yp > hm {
+                yp = hm;
+            }
+            src_ptr = COMPONENTS * x + yp as usize * stride as usize;
+            dst_ptr = COMPONENTS * x;
+            for _ in 0..height {
+                let store_ld = pixels.slice.as_ptr().add(dst_ptr) as *mut u8;
+                let blurred_lo =
+                    i64x2_shr(i64x2_mul(i64x2_extend_low_i32x4(sums), mul_sum), shr_sum);
+                let blurred_hi =
+                    i64x2_shr(i64x2_mul(i64x2_extend_high_i32x4(sums), mul_sum), shr_sum);
+                let blurred_i32 = i32x4_pack_trunc_i64x2(blurred_lo, blurred_hi);
+                let prepared_u16 = u32x4_pack_trunc_u16x8(blurred_i32, blurred_i32);
+                let blurred = u16x8_pack_trunc_u8x16(prepared_u16, prepared_u16);
+                w_store_u8x8_m4::<COMPONENTS>(store_ld, blurred);
+
+                dst_ptr += stride as usize;
+
+                sums = i32x4_sub(sums, sum_out);
+
+                stack_start = sp + div - radius;
+                if stack_start >= div {
+                    stack_start -= div;
+                }
+
+                let stack_ptr = stacks.as_mut_ptr().add(stack_start as usize * 4);
+                let stack_val = v128_load(stack_ptr as *const v128);
+                sum_out = i32x4_sub(sum_out, stack_val);
+
+                if yp < hm {
+                    src_ptr += stride as usize; // stride
+                    yp += 1;
+                }
+
+                let src_ld = pixels.slice.as_ptr().add(src_ptr);
+                let src_pixel = load_u8_s32_fast::<COMPONENTS>(src_ld as *const u8);
+                v128_store(stack_ptr as *mut v128, src_pixel);
+
+                sum_in = i32x4_add(sum_in, src_pixel);
+                sums = i32x4_add(sums, sum_in);
+
+                sp += 1;
+
+                if sp >= div {
+                    sp = 0;
+                }
+                let stack_ptr = stacks.as_mut_ptr().add(sp as usize * 4);
+                let stack_val = v128_load(stack_ptr as *const v128);
+
+                sum_out = i32x4_add(sum_out, stack_val);
+                sum_in = i32x4_sub(sum_in, stack_val);
+            }
+        }
+    }
+}
diff --git a/src/lib/wasm32/utils.rs b/src/lib/wasm32/utils.rs
new file mode 100644
index 0000000..50218ab
--- /dev/null
+++ b/src/lib/wasm32/utils.rs
@@ -0,0 +1,114 @@
+/*
+ * // Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ * //
+ * // Redistribution and use in source and binary forms, with or without modification,
+ * // are permitted provided that the following conditions are met:
+ * //
+ * // 1. Redistributions of source code must retain the above copyright notice, this
+ * // list of conditions and the following disclaimer.
+ * //
+ * // 2. Redistributions in binary form must reproduce the above copyright notice,
+ * // this list of conditions and the following disclaimer in the documentation
+ * // and/or other materials provided with the distribution.
+ * //
+ * // 3. Neither the name of the copyright holder nor the names of its
+ * // contributors may be used to endorse or promote products derived from
+ * // this software without specific prior written permission.
+ * //
+ * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+use std::arch::wasm32::*;
+
+/// Stores up to 4 bytes from the lowest 32-bit lane of a v128
+#[inline]
+#[target_feature(enable = "simd128")]
+pub(crate) unsafe fn w_store_u8x8_m4<const CHANNELS_COUNT: usize>(dst_ptr: *mut u8, in_regi: v128) {
+    let pixel = i32x4_extract_lane::<0>(in_regi) as u32;
+
+    if CHANNELS_COUNT == 4 {
+        (dst_ptr as *mut u32).write_unaligned(pixel);
+    } else if CHANNELS_COUNT == 3 {
+        let bits = pixel.to_le_bytes();
+        let first_byte = u16::from_le_bytes([bits[0], bits[1]]);
+        (dst_ptr as *mut u16).write_unaligned(first_byte);
+        dst_ptr.add(2).write_unaligned(bits[2]);
+    } else if CHANNELS_COUNT == 2 {
+        let bits = pixel.to_le_bytes();
+        let first_byte = u16::from_le_bytes([bits[0], bits[1]]);
+        (dst_ptr as *mut u16).write_unaligned(first_byte);
+    } else {
+        let bits = pixel.to_le_bytes();
+        dst_ptr.write_unaligned(bits[0]);
+    }
+}
+
+/// Packs two u32x4 into one u16x8 using truncation
+#[inline]
+#[target_feature(enable = "simd128")]
+pub(crate) unsafe fn u32x4_pack_trunc_u16x8(a: v128, b: v128) -> v128 {
+    u8x16_shuffle::<0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29>(a, b)
+}
+
+/// Packs two u16x8 into one u8x16 using truncation
+#[inline]
+#[target_feature(enable = "simd128")]
+#[allow(dead_code)]
+pub(crate) unsafe fn u16x8_pack_trunc_u8x16(a: v128, b: v128) -> v128 {
+    u8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(a, b)
+}
+
+/// Packs two i64x2 into one i32x4 using truncation
+#[inline]
+#[target_feature(enable = "simd128")]
+#[allow(dead_code)]
+pub(crate) unsafe fn i32x4_pack_trunc_i64x2(a: v128, b: v128) -> v128 {
+    u8x16_shuffle::<0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27>(a, b)
+}
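+
+// Lane-layout sketch for the packs above: with a = [a0, a1, a2, a3] and
+// b = [b0, b1, b2, b3] as u32x4, u32x4_pack_trunc_u16x8(a, b) keeps the low
+// 16 bits of every lane, producing
+//
+//     [a0 as u16, a1 as u16, a2 as u16, a3 as u16, b0 as u16, ..., b3 as u16]
+//
+// Plain truncation suffices here because callers pass values already clamped
+// or saturated to the destination range.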
+
+#[inline]
+#[target_feature(enable = "simd128")]
+pub(crate) unsafe fn load_u8_s32_fast<const CHANNELS_COUNT: usize>(ptr: *const u8) -> v128 {
+    // LLVM emits rather poor code for these partial loads below opt-level 3,
+    // so the lanes are inserted explicitly with intrinsics.
+    if CHANNELS_COUNT == 4 {
+        let mut undef = i32x4_splat(0);
+        undef = i32x4_replace_lane::<0>(undef, (ptr as *const i32).read_unaligned());
+        let k1 = u16x8_extend_low_u8x16(undef);
+        let k2 = u32x4_extend_low_u16x8(k1);
+        k2
+    } else if CHANNELS_COUNT == 3 {
+        let mut undef = i32x4_splat(0);
+        undef = i16x8_replace_lane::<0>(undef, (ptr as *const i16).read_unaligned());
+        undef = u8x16_replace_lane::<2>(undef, ptr.add(2).read_unaligned());
+        let k1 = u16x8_extend_low_u8x16(undef);
+        let k2 = u32x4_extend_low_u16x8(k1);
+        k2
+    } else if CHANNELS_COUNT == 2 {
+        let mut undef = i32x4_splat(0);
+        undef = i16x8_replace_lane::<0>(undef, (ptr as *const i16).read_unaligned());
+        let k1 = u16x8_extend_low_u8x16(undef);
+        let k2 = u32x4_extend_low_u16x8(k1);
+        k2
+    } else {
+        let mut undef = i32x4_splat(0);
+        undef = u8x16_replace_lane::<0>(undef, ptr.read_unaligned());
+        let k1 = u16x8_extend_low_u8x16(undef);
+        let k2 = u32x4_extend_low_u16x8(k1);
+        k2
+    }
+}
+
+#[inline]
+#[target_feature(enable = "simd128")]
+pub(crate) unsafe fn i32x4_mul_by_3(v: v128) -> v128 {
+    i32x4_add(i32x4_shl(v, 1), v)
+}
diff --git a/src/main.rs b/src/main.rs
index 98281a8..7f7897c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,11 +7,7 @@ use colorutils_rs::TransferFunction;
 use half::f16;
 use image::io::Reader as ImageReader;
 use image::{EncodableLayout, GenericImageView};
-use libblur::{
-    fast_gaussian, fast_gaussian_f16, fast_gaussian_f32, fast_gaussian_next,
-    fast_gaussian_next_f16, fast_gaussian_next_f32, stack_blur, stack_blur_f16, stack_blur_f32,
-    stack_blur_in_linear, EdgeMode, FastBlurChannels, ThreadingPolicy,
-};
+use libblur::{fast_gaussian, fast_gaussian_f16, fast_gaussian_f32, fast_gaussian_in_linear, fast_gaussian_next, fast_gaussian_next_f16, fast_gaussian_next_f32, stack_blur, stack_blur_f16, stack_blur_f32, stack_blur_in_linear, EdgeMode, FastBlurChannels, ThreadingPolicy};
 use std::time::Instant;

 #[allow(dead_code)]
@@ -220,10 +216,10 @@ fn main() {
     //     TransferFunction::Srgb,
     // );

-    let mut f16_bytes: Vec<f16> = dst_bytes
-        .iter()
-        .map(|&x| f16::from_f32(x as f32 * (1. / 255.)))
-        .collect();
+    // let mut f16_bytes: Vec<f16> = dst_bytes
+    //     .iter()
+    //     .map(|&x| f16::from_f32(x as f32 * (1. / 255.)))
+    //     .collect();

     // libblur::gaussian_blur_in_linear(
     //     &bytes,
@@ -240,18 +236,8 @@ fn main() {
     //     TransferFunction::Srgb,
     // );

-    stack_blur_f16(
-        &mut f16_bytes,
-        dimensions.0,
-        dimensions.1,
-        50,
-        FastBlurChannels::Channels3,
-        ThreadingPolicy::Single,
-    );
-
-    // stack_blur(
-    //     &mut dst_bytes,
-    //     dimensions.0 * components as u32,
+    // stack_blur_f16(
+    //     &mut f16_bytes,
     //     dimensions.0,
     //     dimensions.1,
     //     50,
@@ -259,10 +245,22 @@ fn main() {
     //     ThreadingPolicy::Single,
     // );

-    dst_bytes = f16_bytes
-        .iter()
-        .map(|&x| (x.to_f32() * 255f32) as u8)
-        .collect();
+    fast_gaussian_in_linear(
+        &mut dst_bytes,
+        dimensions.0 * components as u32,
+        dimensions.0,
+        dimensions.1,
+        125,
+        FastBlurChannels::Channels3,
+        ThreadingPolicy::Single,
+        TransferFunction::Rec709,
+        EdgeMode::Clamp,
+    );
+    //
+    // dst_bytes = f16_bytes
+    //     .iter()
+    //     .map(|&x| (x.to_f32() * 255f32) as u8)
+    //     .collect();

     // dst_bytes = perform_planar_pass_3(&bytes, dimensions.0 as usize, dimensions.1 as usize);