Skip to content

Commit

Permalink
Added integral transformation u8 SSE
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Sep 2, 2024
1 parent 7cd3adc commit 65ba6a1
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 39 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,15 @@ Example comparison time for blurring image 2828x4242 RGBA 8-bit in multithreaded
Excellent results. It has been optimized, but it is still much slower than any of the approximations. Use it when you
need true gaussian methods - smoothing, anti-alias,
FFT, advanced analysis etc.
There are two methods of convolution: integral approximation and exact.
The approximation in integral form is still gaussian, with 1-3% error, but is about 2x faster.

Kernel size must be odd. Will panic if kernel size is not odd.

O(R) complexity.

```rust
libblur::gaussian_blur( & bytes, src_stride, & mut dst_bytes, dst_stride, width, height, kernel_size, sigma, FastBlurChannels::Channels3);
libblur::gaussian_blur(&bytes, src_stride, & mut dst_bytes, dst_stride, width, height, kernel_size, sigma, FastBlurChannels::Channels3, GaussianPreciseLevel::EXACT);
```

Example comparison time for blurring image 3000x4000 RGB 8-bit in multithreaded mode with 151 kernel size.
Expand Down
58 changes: 37 additions & 21 deletions src/lib/gaussian/sse/horiz_one_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,29 @@

use crate::gaussian::gaussian_approx::PRECISION;
use crate::gaussian::gaussian_filter::GaussianFilter;
use crate::sse::{_mm_hsum_epi32, _mm_loadu_si128_x2, load_u8_s16_fast};
use crate::sse::{_mm_loadu_si128_x2, load_u8_s16_fast, shuffle};
use crate::unsafe_slice::UnsafeSlice;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Reduces the 32-bit lanes of `v` to one sum, arithmetically shifts it right by
/// `PRECISION` bits and clamps the result into `0..=255`.
///
/// The reduction is two shuffle-and-add steps; the value returned is lane 0 of
/// the final vector.
/// NOTE(review): presumably `shuffle(1, 0, 3, 2)` builds an `_MM_SHUFFLE`-style
/// immediate that swaps the 64-bit halves (and, for the low-half 16-bit shuffle,
/// the two low 32-bit lanes), so that lane 0 ends up holding the sum of all four
/// lanes — confirm against the helper's definition.
///
/// # Safety
///
/// The function is compiled with the `sse4.1` target feature enabled; the caller
/// must make sure the executing CPU supports SSE4.1.
#[inline]
#[target_feature(enable = "sse4.1")]
pub unsafe fn _mm_sum_clamp(v: __m128i) -> i32 {
    const SWAP_IMM: i32 = shuffle(1, 0, 3, 2);

    // First fold: add the shuffled vector onto the input.
    let folded_once = _mm_add_epi32(_mm_shuffle_epi32::<SWAP_IMM>(v), v);
    // Second fold: swap the low two elements and add again.
    let folded_twice = _mm_add_epi32(folded_once, _mm_shufflelo_epi16::<SWAP_IMM>(folded_once));

    // Drop the fixed-point fraction, then saturate into the u8 value range.
    let scaled = _mm_srai_epi32::<PRECISION>(folded_twice);
    let non_negative = _mm_max_epi32(scaled, _mm_setzero_si128());
    let saturated = _mm_min_epi32(non_negative, _mm_set1_epi32(255));

    _mm_cvtsi128_si32(saturated)
}

pub fn gaussian_sse_horiz_one_chan_u8_approx(
undef_src: &[u8],
src_stride: u32,
Expand Down Expand Up @@ -266,25 +282,25 @@ unsafe fn gaussian_sse_horiz_one_chan_impl(
r += 1;
}

let agg0 = _mm_hsum_epi32(store0);
let agg0 = _mm_sum_clamp(store0);
let offset0 = y_dst_shift + x as usize;
let dst_ptr0 = unsafe_dst.slice.as_ptr().add(offset0) as *mut u8;
dst_ptr0.write_unaligned((agg0 >> PRECISION).min(255).max(0) as u8);
dst_ptr0.write_unaligned(agg0 as u8);

let agg1 = _mm_hsum_epi32(store1);
let agg1 = _mm_sum_clamp(store1);
let offset1 = offset0 + dst_stride as usize;
let dst_ptr1 = unsafe_dst.slice.as_ptr().add(offset1) as *mut u8;
dst_ptr1.write_unaligned((agg1 >> PRECISION).min(255).max(0) as u8);
dst_ptr1.write_unaligned(agg1 as u8);

let agg2 = _mm_hsum_epi32(store2);
let agg2 = _mm_sum_clamp(store2);
let offset2 = offset1 + dst_stride as usize;
let dst_ptr2 = unsafe_dst.slice.as_ptr().add(offset2) as *mut u8;
dst_ptr2.write_unaligned((agg2 >> PRECISION).min(255).max(0) as u8);
dst_ptr2.write_unaligned(agg2 as u8);

let agg3 = _mm_hsum_epi32(store3);
let agg3 = _mm_sum_clamp(store3);
let offset3 = offset2 + dst_stride as usize;
let dst_ptr3 = unsafe_dst.slice.as_ptr().add(offset3) as *mut u8;
dst_ptr3.write_unaligned((agg3 >> PRECISION).min(255).max(0) as u8);
dst_ptr3.write_unaligned(agg3 as u8);
}
_cy = y;
}
Expand Down Expand Up @@ -415,15 +431,15 @@ unsafe fn gaussian_sse_horiz_one_chan_impl(
r += 1;
}

let agg0 = _mm_hsum_epi32(store0);
let agg0 = _mm_sum_clamp(store0);
let offset0 = y_dst_shift + x as usize;
let dst_ptr0 = unsafe_dst.slice.as_ptr().add(offset0) as *mut u8;
dst_ptr0.write_unaligned((agg0 >> PRECISION).min(255).max(0) as u8);
dst_ptr0.write_unaligned(agg0 as u8);

let agg1 = _mm_hsum_epi32(store1);
let agg1 = _mm_sum_clamp(store1);
let offset1 = offset0 + dst_stride as usize;
let dst_ptr1 = unsafe_dst.slice.as_ptr().add(offset1) as *mut u8;
dst_ptr1.write_unaligned((agg1 >> PRECISION).min(255).max(0) as u8);
dst_ptr1.write_unaligned(agg1 as u8);
}
}
_cy = y;
Expand Down Expand Up @@ -523,10 +539,10 @@ unsafe fn gaussian_sse_horiz_one_chan_impl(
r += 1;
}

let agg = _mm_hsum_epi32(store);
let agg = _mm_sum_clamp(store);
let offset = y_dst_shift + x as usize;
let dst_ptr = unsafe_dst.slice.as_ptr().add(offset) as *mut u8;
dst_ptr.write_unaligned((agg >> PRECISION).min(255).max(0) as u8);
dst_ptr.write_unaligned(agg as u8);
}
}
}
Expand Down Expand Up @@ -689,15 +705,15 @@ unsafe fn gaussian_sse_horiz_one_chan_filter_impl(
r += 1;
}

let agg0 = _mm_hsum_epi32(store0);
let agg0 = _mm_sum_clamp(store0);
let offset0 = y_dst_shift + x as usize;
let dst_ptr0 = unsafe_dst.slice.as_ptr().add(offset0) as *mut u8;
dst_ptr0.write_unaligned((agg0 >> PRECISION).min(255).max(0) as u8);
dst_ptr0.write_unaligned(agg0 as u8);

let agg1 = _mm_hsum_epi32(store1);
let agg1 = _mm_sum_clamp(store1);
let offset1 = offset0 + dst_stride as usize;
let dst_ptr1 = unsafe_dst.slice.as_ptr().add(offset1) as *mut u8;
dst_ptr1.write_unaligned((agg1 >> PRECISION).min(255).max(0) as u8);
dst_ptr1.write_unaligned(agg1 as u8);
}
_cy = y;
}
Expand Down Expand Up @@ -796,10 +812,10 @@ unsafe fn gaussian_sse_horiz_one_chan_filter_impl(
r += 1;
}

let agg = _mm_hsum_epi32(store);
let agg = _mm_sum_clamp(store);
let offset = y_dst_shift + x as usize;
let dst_ptr = unsafe_dst.slice.as_ptr().add(offset) as *mut u8;
dst_ptr.write_unaligned((agg >> PRECISION).min(255).max(0) as u8);
dst_ptr.write_unaligned(agg as u8);
}
}
}
9 changes: 2 additions & 7 deletions src/lib/gaussian/sse/horiz_one_channel_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use crate::gaussian::gaussian_filter::GaussianFilter;
use crate::sse::{_mm_hsum_ps, _mm_loadu_ps_x2, _mm_loadu_ps_x4, _mm_loadu_si128_x2};
use crate::sse::{_mm_hsum_ps, _mm_loadu_ps_x2, _mm_loadu_ps_x4, _mm_loadu_si128_x2, load_u8_s32_fast};
use crate::unsafe_slice::UnsafeSlice;
use erydanos::_mm_prefer_fma_ps;
#[cfg(target_arch = "x86")]
Expand Down Expand Up @@ -856,12 +856,7 @@ unsafe fn gaussian_sse_horiz_one_chan_filter_impl<T>(
) as usize;
let px = current_x;
let s_ptr = src.as_ptr().add(y_src_shift + px);
let pixel_colors_i32 = _mm_setr_epi32(
s_ptr.read_unaligned() as i32,
s_ptr.add(1).read_unaligned() as i32,
s_ptr.add(2).read_unaligned() as i32,
s_ptr.add(3).read_unaligned() as i32,
);
let pixel_colors_i32 = load_u8_s32_fast::<4>(s_ptr);
let pixel_colors_f32 = _mm_cvtepi32_ps(pixel_colors_i32);
let weight = filter_weights.as_ptr().add(r);
let f_weight = _mm_loadu_ps(weight);
Expand Down
20 changes: 10 additions & 10 deletions src/lib/sse/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -350,16 +350,16 @@ pub unsafe fn _mm_hsum_ps(v: __m128) -> f32 {
_mm_cvtss_f32(sums)
}

/// Reduces the 32-bit lanes of `v` with two shuffle-and-add steps and returns
/// lane 0 of the result.
///
/// NOTE(review): presumably `shuffle(1, 0, 3, 2)` builds an `_MM_SHUFFLE`-style
/// immediate, making the returned value the sum of all four lanes — confirm
/// against the helper's definition.
///
/// # Safety
///
/// The function is compiled with the `sse4.1` target feature enabled; the caller
/// must make sure the executing CPU supports SSE4.1.
#[inline]
#[target_feature(enable = "sse4.1")]
pub unsafe fn _mm_hsum_epi32(v: __m128i) -> i32 {
    const SWAP_IMM: i32 = shuffle(1, 0, 3, 2);

    // First fold: add the shuffled vector onto the input.
    let folded_once = _mm_add_epi32(_mm_shuffle_epi32::<SWAP_IMM>(v), v);
    // Second fold: swap the low two elements and add again.
    let folded_twice = _mm_add_epi32(folded_once, _mm_shufflelo_epi16::<SWAP_IMM>(folded_once));

    _mm_cvtsi128_si32(folded_twice)
}
// #[inline]
// #[target_feature(enable = "sse4.1")]
// pub unsafe fn _mm_hsum_epi32(v: __m128i) -> i32 {
// const SHUFFLE_1: i32 = shuffle(1, 0, 3, 2);
// let hi64 = _mm_shuffle_epi32::<SHUFFLE_1>(v);
// let sum64 = _mm_add_epi32(hi64, v);
// let hi32 = _mm_shufflelo_epi16::<SHUFFLE_1>(sum64); // Swap the low two elements
// let sum32 = _mm_add_epi32(sum64, hi32);
// _mm_cvtsi128_si32(sum32)
// }

#[inline]
#[target_feature(enable = "sse4.1")]
Expand Down

0 comments on commit 65ba6a1

Please sign in to comment.