Skip to content

Commit

Permalink
Added fast gaussian very fast path for neon
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Apr 27, 2024
1 parent 4a3ced8 commit a2c0977
Show file tree
Hide file tree
Showing 7 changed files with 381 additions and 146 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ set(SPARKYUV_SOURCES
src/Rotate.cpp
src/FastGaussian.cpp
src/FastGaussian.h
src/GaussianBlur.cpp)
src/GaussianBlur.cpp
src/FastGaussianNeon.cpp
src/FastGaussianNeon.h)

set(HWY_SOURCES
highway/hwy/aligned_allocator.cc highway/hwy/targets.cc highway/hwy/targets.cc
Expand Down
216 changes: 108 additions & 108 deletions include/sparkyuv-ycgcor.h

Large diffs are not rendered by default.

53 changes: 35 additions & 18 deletions src/FastGaussian.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
#include "sparkyuv-internal.h"
#include "PixelLoader.h"
#include "concurrency.hpp"
#include "hwy/aligned_allocator.h"
#include "TypeSupport.h"
#include "FastGaussianNeon.h"

namespace sparkyuv {

Expand All @@ -32,7 +32,7 @@ void VerticalGaussianPassChannel(T *data,
const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));

constexpr int bufLength = 1023;
auto buffer = hwy::AllocateAligned<V>(bufLength + 1);
V buffer[1024];

for (uint32_t x = 0; x < width; ++x) {
V dif = 0, sum = (radius * radius) >> 1;
Expand Down Expand Up @@ -69,7 +69,7 @@ void HorizontalGaussianPassChannel(T *data,
const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));

constexpr int bufLength = 1023;
auto buffer = hwy::AllocateAligned<V>(bufLength + 1);
V buffer[1024];

for (int y = 0; y < height; ++y) {
V dif = 0, sum = (radius * radius) >> 1;
Expand Down Expand Up @@ -103,9 +103,9 @@ void VerticalGaussianPass(T *data, const uint32_t stride,
const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));

constexpr int bufLength = 1023;
auto bufferR = hwy::AllocateAligned<V>(bufLength + 1);
auto bufferG = hwy::AllocateAligned<V>(bufLength + 1);
auto bufferB = hwy::AllocateAligned<V>(bufLength + 1);
V bufferR[1024];
V bufferG[1024];
V bufferB[1024];

const int channels = getPixelTypeComponents(PixelType);

Expand Down Expand Up @@ -171,9 +171,9 @@ void HorizontalGaussianPass(T *data,
const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));

constexpr int bufLength = 1023;
auto bufferR = hwy::AllocateAligned<V>(bufLength + 1);
auto bufferG = hwy::AllocateAligned<V>(bufLength + 1);
auto bufferB = hwy::AllocateAligned<V>(bufLength + 1);
V bufferR[1024];
V bufferG[1024];
V bufferB[1024];

const int channels = getPixelTypeComponents(PixelType);

Expand Down Expand Up @@ -236,9 +236,9 @@ void VerticalFastGaussianPassNext(T *data, const uint32_t stride, const int widt
const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius) * static_cast<float>(radius));

constexpr int bufLength = 1023;
auto bufferR = hwy::AllocateAligned<V>(bufLength + 1);
auto bufferG = hwy::AllocateAligned<V>(bufLength + 1);
auto bufferB = hwy::AllocateAligned<V>(bufLength + 1);
V bufferR[1024];
V bufferG[1024];
V bufferB[1024];

const int channels = getPixelTypeComponents(PixelType);

Expand Down Expand Up @@ -303,9 +303,9 @@ void HorizontalFastGaussianNext(T *data, const uint32_t stride,
const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius) * static_cast<float>(radius));

constexpr int bufLength = 1023;
auto bufferR = hwy::AllocateAligned<V>(bufLength + 1);
auto bufferG = hwy::AllocateAligned<V>(bufLength + 1);
auto bufferB = hwy::AllocateAligned<V>(bufLength + 1);
V bufferR[1024];
V bufferG[1024];
V bufferB[1024];

const int channels = getPixelTypeComponents(PixelType);

Expand Down Expand Up @@ -346,7 +346,7 @@ void HorizontalFastGaussianNext(T *data, const uint32_t stride,
int mPNextRad = (x + 2 * radius) & bufLength;

auto srcNext = reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(data) + y * stride);
int px = std::clamp(x + radius, 0, static_cast<int32_t>(width - 1)) * channels;
int px = std::clamp(x + 3 * radius / 2, 0, static_cast<int32_t>(width - 1)) * channels;
srcNext += px;

V pR, pG, pB, pA;
Expand All @@ -364,18 +364,35 @@ void HorizontalFastGaussianNext(T *data, const uint32_t stride,
}
}

void FastGaussianBlurRGBA(uint8_t *data, uint32_t stride, uint32_t width, uint32_t height, int radius) {
const int threadCount = concurrency::getThreadCounts(width, height);
concurrency::parallel_for_segment(threadCount, width, [&](uint32_t start, uint32_t end) {
#if __aarch64__
VerticalGaussianPassRGBANeon(data, stride, width, height, radius, start, end);
#else
VerticalGaussianPass<uint8_t, int, sparkyuv::PIXEL_RGBA>(data, stride, width, height, radius, start, end);
#endif
});
concurrency::parallel_for_segment(threadCount, height, [&](uint32_t start, uint32_t end) {
#if __aarch64__
HorizontalGaussianPassRGBANeon(data, stride, width, height, radius, start, end);
#else
HorizontalGaussianPass<uint8_t, int, sparkyuv::PIXEL_RGBA>(data, stride, width, height, radius, start, end);
#endif
});
}

#define FAST_GAUSSIAN_DECLARATION_R(pixelName, pixelType, storageType) \
void FastGaussianBlur##pixelName(storageType *data, uint32_t stride, uint32_t width, uint32_t height, int radius) {\
const int threadCount = concurrency::getThreadCounts(width, height);\
concurrency::parallel_for_segment(threadCount, width, [&](int start, int end) {\
concurrency::parallel_for_segment(threadCount, width, [&](int start, int end) { \
VerticalGaussianPass<storageType, int, sparkyuv::PIXEL_##pixelType>(data, stride, width, height, radius, start, end);\
});\
concurrency::parallel_for_segment(threadCount, height, [&](int start, int end) {\
HorizontalGaussianPass<storageType, int, sparkyuv::PIXEL_##pixelType>(data, stride, width, height, radius, start, end);\
});\
}

FAST_GAUSSIAN_DECLARATION_R(RGBA, RGBA, uint8_t)
FAST_GAUSSIAN_DECLARATION_R(RGB, RGB, uint8_t)
#if SPARKYUV_FULL_CHANNELS
FAST_GAUSSIAN_DECLARATION_R(ARGB, ARGB, uint8_t)
Expand Down
168 changes: 168 additions & 0 deletions src/FastGaussianNeon.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
// Copyright (C) 2024 Radzivon Bartoshyk
//
// This file belongs to sparkyuv project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "FastGaussianNeon.h"

#if __aarch64__

#include <arm_neon.h>
#include <algorithm>

namespace sparkyuv {

#if defined(__GNUC__) || defined(__clang__)
#define FAST_GAUSSIAN_INLINE __attribute__((flatten)) inline
#else
#define FAST_GAUSSIAN_INLINE inline
#endif

FAST_GAUSSIAN_INLINE int32x4_t vsld_u8_to_i32(const uint8_t *src, bool useVld) {
uint8_t safeStore[8] = {0};
const uint8_t *origin;
if (useVld) {
origin = src;
} else {
std::copy(src, src + 3, safeStore);
origin = safeStore;
}
auto fullPx = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(origin))));
return vmovl_s16(fullPx);
}

void VerticalGaussianPassRGBANeon(uint8_t *data,
const uint32_t stride,
const uint32_t width,
const uint32_t height,
const int radius,
const uint32_t start,
const uint32_t end) {
const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));

constexpr int64_t bufLength = 1023;
int32_t buffer[1024][4] = {{0}};

auto radius64 = static_cast<int64_t>(radius);

auto initialSum = (radius * radius) >> 1;

for (int64_t x = start; x < static_cast<int64_t> (width) && x < static_cast<int64_t>(end); ++x) {
int32x4_t dif = vdupq_n_s32(0);
int32x4_t sum = vdupq_n_s32(initialSum);
int64_t px = x * 4;
for (int64_t y = 0 - 2 * radius64; y < static_cast<int64_t>(height); ++y) {
if (y >= 0) {
auto src = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(data) + static_cast<uint32_t >(y) * stride);
int64_t arrIndex = (y - radius64) & bufLength;
int64_t dArrIndex = y & bufLength;

auto p16 = vqmovun_s32(vcvtq_s32_f32(vrndq_f32(vmulq_n_f32(vcvtq_f32_s32(sum), weight))));
auto p8 = vqmovn_u16(vcombine_u16(p16, p16));

src[px] = vget_lane_u8(p8, 0);
src[px + 1] = vget_lane_u8(p8, 1);
src[px + 2] = vget_lane_u8(p8, 2);

int32x4_t bufferValue1 = vld1q_s32(reinterpret_cast<const int *>(&buffer[arrIndex][0]));

int32x4_t bufferValue2 = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
bufferValue2 = vshlq_n_s32(bufferValue2, 1);

dif = vaddq_s32(dif, vsubq_s32(bufferValue1, bufferValue2));
} else if (y + radius >= 0) {
int64_t dArrIndex = y & bufLength;
int32x4_t bufferValue = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
bufferValue = vshlq_n_s32(bufferValue, 1);
dif = vsubq_s32(dif, bufferValue);
}

auto srcNext =
reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(data)
+ std::clamp(static_cast<int32_t >(y + radius), 0, static_cast<int32_t >(height - 1)) * stride);
int32x4_t p = vsld_u8_to_i32(&srcNext[px], x + 2 < width);
dif = vaddq_s32(dif, p);
sum = vaddq_s32(sum, dif);

auto arrIndex = (y + radius64) & bufLength;

vst1q_s32(&buffer[arrIndex][0], p);
}
}
}

void HorizontalGaussianPassRGBANeon(uint8_t *data,
const uint32_t stride,
const uint32_t width,
const uint32_t height,
const int radius,
const uint32_t start,
const uint32_t end) {
const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));

constexpr int64_t bufLength = 1023;
int32_t buffer[1024][4] = {{0}};

auto radius64 = static_cast<int64_t>(radius);

auto initialSum = (radius * radius) >> 1;

for (auto y = static_cast<int64_t>(start); y < static_cast<int64_t>(height) && y < static_cast<int64_t>(end); ++y) {
int32x4_t dif = vdupq_n_s32(0);
int32x4_t sum = vdupq_n_s32(initialSum);
for (int64_t x = 0 - 2 * radius64; x < static_cast<int64_t>(width); ++x) {
if (x >= 0) {
const int64_t px = x * 4;

auto src = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(data) + static_cast<uint32_t >(y) * stride);
int64_t arrIndex = (x - radius64) & bufLength;
int64_t dArrIndex = x & bufLength;

auto p16 = vqmovun_s32(vcvtq_s32_f32(vrndq_f32(vmulq_n_f32(vcvtq_f32_s32(sum), weight))));
auto p8 = vqmovn_u16(vcombine_u16(p16, p16));

src[px] = vget_lane_u8(p8, 0);
src[px + 1] = vget_lane_u8(p8, 1);
src[px + 2] = vget_lane_u8(p8, 2);

int32x4_t bufferValue1 = vld1q_s32(reinterpret_cast<const int *>(&buffer[arrIndex][0]));

int32x4_t bufferValue2 = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
bufferValue2 = vshlq_n_s32(bufferValue2, 1);

dif = vaddq_s32(dif, vsubq_s32(bufferValue1, bufferValue2));
} else if (x + radius >= 0) {
int64_t dArrIndex = x & bufLength;
int32x4_t bufferValue = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
bufferValue = vshlq_n_s32(bufferValue, 1);
dif = vsubq_s32(dif, bufferValue);
}

auto srcNext = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(data) + y * stride);
int64_t px =
std::clamp(static_cast<int64_t >(x + radius), static_cast<int64_t >(0), static_cast<int64_t>(width - 1));
int32x4_t p = vsld_u8_to_i32(&srcNext[px*4], px + 2 < width);
dif = vaddq_s32(dif, p);
sum = vaddq_s32(sum, dif);

auto arrIndex = (x + radius64) & bufLength;

vst1q_s32(&buffer[arrIndex][0], p);
}
}
}

}

#endif
44 changes: 44 additions & 0 deletions src/FastGaussianNeon.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (C) 2024 Radzivon Bartoshyk
//
// This file belongs to sparkyuv project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef YUV_SRC_FASTGAUSSIANNEON_H_
#define YUV_SRC_FASTGAUSSIANNEON_H_

#if __aarch64__

#include <cstdint>

namespace sparkyuv {
void VerticalGaussianPassRGBANeon(uint8_t *data,
uint32_t stride,
uint32_t width,
uint32_t height,
int radius,
uint32_t start,
uint32_t end);

void HorizontalGaussianPassRGBANeon(uint8_t *data,
uint32_t stride,
uint32_t width,
uint32_t height,
int radius,
uint32_t start,
uint32_t end);
}

#endif

#endif //YUV_SRC_FASTGAUSSIANNEON_H_
8 changes: 4 additions & 4 deletions src/concurrency.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,12 @@ void parallel_for_with_thread_id(const int numThreads, const int numIterations,
}

template<typename Function, typename... Args>
void parallel_for_segment(const int numThreads, const int numIterations, Function &&func, Args &&... args) {
void parallel_for_segment(const int numThreads, const uint32_t numIterations, Function &&func, Args &&... args) {
static_assert(std::is_invocable_v<Function, int, int, Args...>, "func must take an int parameter for iteration id");
#if THREADS_SUPPORTED
std::vector<std::thread> threads;

int segmentHeight = numIterations / numThreads;
int segmentHeight = numIterations / static_cast<uint32_t >(numThreads);

auto parallelWorker = [&](int start, int end) {
std::invoke(func, start, end, std::forward<Args>(args)...);
Expand All @@ -161,8 +161,8 @@ void parallel_for_segment(const int numThreads, const int numIterations, Functio
if (numThreads > 1) {
// Launch N-1 worker threads
for (int i = 1; i < numThreads; ++i) {
int start = i * segmentHeight;
int end = (i + 1) * segmentHeight;
uint32_t start = i * segmentHeight;
uint32_t end = (i + 1) * segmentHeight;
if (i == numThreads - 1) {
end = numIterations;
}
Expand Down
Loading

0 comments on commit a2c0977

Please sign in to comment.