Skip to content

Commit

Permalink
Some updates, added wasm
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Aug 25, 2024
1 parent 9acae9c commit 9304ce8
Show file tree
Hide file tree
Showing 28 changed files with 2,031 additions and 1,097 deletions.
3 changes: 3 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,8 @@ rustflags = ["-Ctarget-cpu=native"]
[target.x86_64-apple-darwin]
rustflags = ["-Ctarget-feature=+sse4.1"]

[target.wasm32-unknown-unknown]
rustflags = ["-C", "target-feature=+simd128"]

[env]
DYLD_FALLBACK_LIBRARY_PATH="/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/"
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Fast blur algorithms library for Rust

There are some very good and blazing fast algorithms that do blurring images.
Best optimized for NEON and SSE.
Best optimized for NEON and SSE, partially AVX, partially done WASM.

You may receive gaussian blur in 100 FPS for 4K photo.

Expand Down
2 changes: 1 addition & 1 deletion src/lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "libblur"
version = "0.13.4"
version = "0.13.5"
edition = "2021"
description = "Fast image blurring in pure Rust"
readme = "../../README.md"
Expand Down
11 changes: 11 additions & 0 deletions src/lib/fast_gaussian.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ use crate::sse::{
use crate::threading_policy::ThreadingPolicy;
use crate::to_storage::ToStorage;
use crate::unsafe_slice::UnsafeSlice;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use crate::wasm32::{fast_gaussian_horizontal_pass_wasm_u8, fast_gaussian_vertical_pass_wasm_u8};
use crate::{clamp_edge, reflect_101, EdgeMode};

const BASE_RADIUS_I64_CUTOFF: u32 = 180;
Expand Down Expand Up @@ -708,6 +710,15 @@ fn fast_gaussian_impl<
}
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
if std::any::type_name::<T>() == "u8" && BASE_RADIUS_I64_CUTOFF > radius {
_dispatcher_vertical =
fast_gaussian_vertical_pass_wasm_u8::<T, CHANNEL_CONFIGURATION, EDGE_MODE>;
_dispatcher_horizontal =
fast_gaussian_horizontal_pass_wasm_u8::<T, CHANNEL_CONFIGURATION, EDGE_MODE>;
}
}
}
if thread_count == 1 {
_dispatcher_vertical(&unsafe_image, stride, width, height, radius, 0, width);
Expand Down
13 changes: 13 additions & 0 deletions src/lib/fast_gaussian_next.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ use crate::sse::{
};
use crate::to_storage::ToStorage;
use crate::unsafe_slice::UnsafeSlice;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use crate::wasm32::{
fast_gaussian_next_horizontal_pass_wasm_u8, fast_gaussian_next_vertical_pass_wasm_u8,
};
use crate::{clamp_edge, reflect_101, EdgeMode, FastBlurChannels, ThreadingPolicy};
use colorutils_rs::linear_to_planar::linear_to_plane;
use colorutils_rs::planar_to_linear::plane_to_linear;
Expand Down Expand Up @@ -637,6 +641,15 @@ fn fast_gaussian_next_impl<
}
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
if std::any::type_name::<T>() == "u8" && BASE_RADIUS_I64_CUTOFF > radius {
_dispatcher_vertical =
fast_gaussian_next_vertical_pass_wasm_u8::<T, CHANNEL_CONFIGURATION, EDGE_MODE>;
_dispatcher_horizontal =
fast_gaussian_next_horizontal_pass_wasm_u8::<T, CHANNEL_CONFIGURATION, EDGE_MODE>;
}
}

let thread_count = threading_policy.get_threads_count(width, height) as u32;
if thread_count == 1 {
Expand Down
2 changes: 2 additions & 0 deletions src/lib/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ mod threading_policy;
mod to_storage;
mod unsafe_slice;
mod cpu_features;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
mod wasm32;

pub use channels_configuration::FastBlurChannels;
pub use colorutils_rs::TransferFunction;
Expand Down
198 changes: 97 additions & 101 deletions src/lib/neon/fast_gaussian.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,68 +44,66 @@ pub fn fast_gaussian_horizontal_pass_neon_u8<
start: u32,
end: u32,
) {
let edge_mode: EdgeMode = EDGE_MODE.into();
let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
let initial_sum = ((radius * radius) >> 1) as i32;

let radius_64 = radius as i64;
let width_wide = width as i64;
let v_weight = unsafe { vdupq_n_f32((1f64 / (radius as f64 * radius as f64)) as f32) };
for y in start..std::cmp::min(height, end) {
let mut diffs: int32x4_t = unsafe { vdupq_n_s32(0) };
let mut summs: int32x4_t = unsafe { vdupq_n_s32(initial_sum) };

let current_y = ((y as i64) * (stride as i64)) as usize;

let start_x = 0 - 2 * radius_64;
for x in start_x..(width as i64) {
if x >= 0 {
let current_px = ((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;

let prepared_px_s32 =
unsafe { vreinterpretq_u32_s32(vmulq_s32_f32(summs, v_weight)) };
let prepared_u16 = unsafe { vqmovn_u32(prepared_px_s32) };
let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };

let bytes_offset = current_y + current_px;
unsafe {
unsafe {
let edge_mode: EdgeMode = EDGE_MODE.into();
let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
let initial_sum = ((radius * radius) >> 1) as i32;

let radius_64 = radius as i64;
let width_wide = width as i64;
let v_weight = vdupq_n_f32((1f64 / (radius as f64 * radius as f64)) as f32);
for y in start..std::cmp::min(height, end) {
let mut diffs: int32x4_t = vdupq_n_s32(0);
let mut summs: int32x4_t = vdupq_n_s32(initial_sum);

let current_y = ((y as i64) * (stride as i64)) as usize;

let start_x = 0 - 2 * radius_64;
for x in start_x..(width as i64) {
if x >= 0 {
let current_px =
((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;

let prepared_px_s32 = vreinterpretq_u32_s32(vmulq_s32_f32(summs, v_weight));
let prepared_u16 = vqmovn_u32(prepared_px_s32);
let prepared_u8 = vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16));

let bytes_offset = current_y + current_px;
let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
}
store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);

let arr_index = ((x - radius_64) & 1023) as usize;
let d_arr_index = (x & 1023) as usize;
let arr_index = ((x - radius_64) & 1023) as usize;
let d_arr_index = (x & 1023) as usize;

let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
let mut d_stored = unsafe { vld1q_s32(d_buf_ptr) };
d_stored = unsafe { vshlq_n_s32::<1>(d_stored) };
let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
let mut d_stored = vld1q_s32(d_buf_ptr);
d_stored = vshlq_n_s32::<1>(d_stored);

let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
let a_stored = unsafe { vld1q_s32(buf_ptr) };
let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
let a_stored = vld1q_s32(buf_ptr);

diffs = unsafe { vaddq_s32(diffs, vsubq_s32(a_stored, d_stored)) };
} else if x + radius_64 >= 0 {
let arr_index = (x & 1023) as usize;
let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
let mut stored = unsafe { vld1q_s32(buf_ptr) };
stored = unsafe { vshlq_n_s32::<1>(stored) };
diffs = unsafe { vsubq_s32(diffs, stored) };
}
diffs = vaddq_s32(diffs, vsubq_s32(a_stored, d_stored));
} else if x + radius_64 >= 0 {
let arr_index = (x & 1023) as usize;
let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
let mut stored = vld1q_s32(buf_ptr);
stored = vshlq_n_s32::<1>(stored);
diffs = vsubq_s32(diffs, stored);
}

let next_row_y = (y as usize) * (stride as usize);
let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
let next_row_px = next_row_x * CHANNELS_COUNT;
let next_row_y = (y as usize) * (stride as usize);
let next_row_x = clamp_edge!(edge_mode, x + radius_64, 0, width_wide - 1);
let next_row_px = next_row_x * CHANNELS_COUNT;

let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8 };
let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_px) as *mut u8;
let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);

let arr_index = ((x + radius_64) & 1023) as usize;
let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
let arr_index = ((x + radius_64) & 1023) as usize;
let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();

diffs = unsafe { vaddq_s32(diffs, pixel_color) };
summs = unsafe { vaddq_s32(summs, diffs) };
unsafe {
diffs = vaddq_s32(diffs, pixel_color);
summs = vaddq_s32(summs, diffs);
vst1q_s32(buf_ptr, pixel_color);
}
}
Expand All @@ -125,70 +123,68 @@ pub(crate) fn fast_gaussian_vertical_pass_neon_u8<
start: u32,
end: u32,
) {
let edge_mode: EdgeMode = EDGE_MODE.into();
let bytes: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(undefined_slice) };
let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
let initial_sum = ((radius * radius) >> 1) as i32;
unsafe {
let edge_mode: EdgeMode = EDGE_MODE.into();
let bytes: &UnsafeSlice<'_, u8> = std::mem::transmute(undefined_slice);
let mut buffer: [[i32; 4]; 1024] = [[0; 4]; 1024];
let initial_sum = ((radius * radius) >> 1) as i32;

let height_wide = height as i64;
let height_wide = height as i64;

let radius_64 = radius as i64;
let v_weight = unsafe { vdupq_n_f32((1f64 / (radius as f64 * radius as f64)) as f32) };
for x in start..std::cmp::min(width, end) {
let mut diffs: int32x4_t = unsafe { vdupq_n_s32(0) };
let mut summs: int32x4_t = unsafe { vdupq_n_s32(initial_sum) };
let radius_64 = radius as i64;
let v_weight = vdupq_n_f32((1f64 / (radius as f64 * radius as f64)) as f32);
for x in start..std::cmp::min(width, end) {
let mut diffs: int32x4_t = vdupq_n_s32(0);
let mut summs: int32x4_t = vdupq_n_s32(initial_sum);

let start_y = 0 - 2 * radius as i64;
for y in start_y..height_wide {
let current_y = (y * (stride as i64)) as usize;
let start_y = 0 - 2 * radius as i64;
for y in start_y..height_wide {
let current_y = (y * (stride as i64)) as usize;

if y >= 0 {
let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
if y >= 0 {
let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;

let prepared_px_s32 =
unsafe { vreinterpretq_u32_s32(vmulq_s32_f32(summs, v_weight)) };
let prepared_u16 = unsafe { vqmovn_u32(prepared_px_s32) };
let prepared_u8 = unsafe { vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16)) };
let prepared_px_s32 = vreinterpretq_u32_s32(vmulq_s32_f32(summs, v_weight));
let prepared_u16 = vqmovn_u32(prepared_px_s32);
let prepared_u8 = vqmovn_u16(vcombine_u16(prepared_u16, prepared_u16));

let bytes_offset = current_y + current_px;
let bytes_offset = current_y + current_px;

unsafe {
let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(bytes_offset);
store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8)
}
store_u8x8_m4::<CHANNELS_COUNT>(dst_ptr, prepared_u8);

let arr_index = ((y - radius_64) & 1023) as usize;
let d_arr_index = (y & 1023) as usize;
let arr_index = ((y - radius_64) & 1023) as usize;
let d_arr_index = (y & 1023) as usize;

let d_buf_ptr = unsafe { buffer.get_unchecked_mut(d_arr_index).as_mut_ptr() };
let mut d_stored = unsafe { vld1q_s32(d_buf_ptr) };
d_stored = unsafe { vshlq_n_s32::<1>(d_stored) };
let d_buf_ptr = buffer.get_unchecked_mut(d_arr_index).as_mut_ptr();
let mut d_stored = vld1q_s32(d_buf_ptr);
d_stored = vshlq_n_s32::<1>(d_stored);

let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
let a_stored = unsafe { vld1q_s32(buf_ptr) };
let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
let a_stored = vld1q_s32(buf_ptr);

diffs = unsafe { vaddq_s32(diffs, vsubq_s32(a_stored, d_stored)) };
} else if y + radius_64 >= 0 {
let arr_index = (y & 1023) as usize;
let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
let mut stored = unsafe { vld1q_s32(buf_ptr) };
stored = unsafe { vshlq_n_s32::<1>(stored) };
diffs = unsafe { vsubq_s32(diffs, stored) };
}
diffs = vaddq_s32(diffs, vsubq_s32(a_stored, d_stored));
} else if y + radius_64 >= 0 {
let arr_index = (y & 1023) as usize;
let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();
let mut stored = vld1q_s32(buf_ptr);
stored = vshlq_n_s32::<1>(stored);
diffs = vsubq_s32(diffs, stored);
}

let next_row_y =
clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
let next_row_x = (x * CHANNELS_COUNT as u32) as usize;

let next_row_y =
clamp_edge!(edge_mode, y + radius_64, 0, height_wide - 1) * (stride as usize);
let next_row_x = (x * CHANNELS_COUNT as u32) as usize;
let s_ptr = bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8;
let pixel_color = load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr);

let s_ptr = unsafe { bytes.slice.as_ptr().add(next_row_y + next_row_x) as *mut u8 };
let pixel_color = unsafe { load_u8_s32_fast::<CHANNELS_COUNT>(s_ptr) };
let arr_index = ((y + radius_64) & 1023) as usize;
let buf_ptr = buffer.get_unchecked_mut(arr_index).as_mut_ptr();

let arr_index = ((y + radius_64) & 1023) as usize;
let buf_ptr = unsafe { buffer.get_unchecked_mut(arr_index).as_mut_ptr() };
diffs = vaddq_s32(diffs, pixel_color);
summs = vaddq_s32(summs, diffs);

diffs = unsafe { vaddq_s32(diffs, pixel_color) };
summs = unsafe { vaddq_s32(summs, diffs) };
unsafe {
vst1q_s32(buf_ptr, pixel_color);
}
}
Expand Down
Loading

0 comments on commit 9304ce8

Please sign in to comment.