Added fast gaussian very fast path for neon

awxkee · Apr 27, 2024 · a2c0977 · a2c0977
1 parent 4a3ced8
commit a2c0977
Show file tree

Hide file tree

Showing 7 changed files with 381 additions and 146 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -44,7 +44,9 @@ set(SPARKYUV_SOURCES
         src/Rotate.cpp
         src/FastGaussian.cpp
         src/FastGaussian.h
-        src/GaussianBlur.cpp)
+        src/GaussianBlur.cpp
+        src/FastGaussianNeon.cpp
+        src/FastGaussianNeon.h)
 
 set(HWY_SOURCES
         highway/hwy/aligned_allocator.cc highway/hwy/targets.cc highway/hwy/targets.cc

diff --git a/include/sparkyuv-ycgcor.h b/include/sparkyuv-ycgcor.h
diff --git a/src/FastGaussian.cpp b/src/FastGaussian.cpp
@@ -18,8 +18,8 @@
 #include "sparkyuv-internal.h"
 #include "PixelLoader.h"
 #include "concurrency.hpp"
-#include "hwy/aligned_allocator.h"
 #include "TypeSupport.h"
+#include "FastGaussianNeon.h"
 
 namespace sparkyuv {
 
@@ -32,7 +32,7 @@ void VerticalGaussianPassChannel(T *data,
   const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));
 
   constexpr int bufLength = 1023;
-  auto buffer = hwy::AllocateAligned<V>(bufLength + 1);
+  V buffer[1024];
 
   for (uint32_t x = 0; x < width; ++x) {
     V dif = 0, sum = (radius * radius) >> 1;
@@ -69,7 +69,7 @@ void HorizontalGaussianPassChannel(T *data,
   const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));
 
   constexpr int bufLength = 1023;
-  auto buffer = hwy::AllocateAligned<V>(bufLength + 1);
+  V buffer[1024];
 
   for (int y = 0; y < height; ++y) {
     V dif = 0, sum = (radius * radius) >> 1;
@@ -103,9 +103,9 @@ void VerticalGaussianPass(T *data, const uint32_t stride,
   const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));
 
   constexpr int bufLength = 1023;
-  auto bufferR = hwy::AllocateAligned<V>(bufLength + 1);
-  auto bufferG = hwy::AllocateAligned<V>(bufLength + 1);
-  auto bufferB = hwy::AllocateAligned<V>(bufLength + 1);
+  V bufferR[1024];
+  V bufferG[1024];
+  V bufferB[1024];
 
   const int channels = getPixelTypeComponents(PixelType);
 
@@ -171,9 +171,9 @@ void HorizontalGaussianPass(T *data,
   const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));
 
   constexpr int bufLength = 1023;
-  auto bufferR = hwy::AllocateAligned<V>(bufLength + 1);
-  auto bufferG = hwy::AllocateAligned<V>(bufLength + 1);
-  auto bufferB = hwy::AllocateAligned<V>(bufLength + 1);
+  V bufferR[1024];
+  V bufferG[1024];
+  V bufferB[1024];
 
   const int channels = getPixelTypeComponents(PixelType);
 
@@ -236,9 +236,9 @@ void VerticalFastGaussianPassNext(T *data, const uint32_t stride, const int widt
   const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius) * static_cast<float>(radius));
 
   constexpr int bufLength = 1023;
-  auto bufferR = hwy::AllocateAligned<V>(bufLength + 1);
-  auto bufferG = hwy::AllocateAligned<V>(bufLength + 1);
-  auto bufferB = hwy::AllocateAligned<V>(bufLength + 1);
+  V bufferR[1024];
+  V bufferG[1024];
+  V bufferB[1024];
 
   const int channels = getPixelTypeComponents(PixelType);
 
@@ -303,9 +303,9 @@ void HorizontalFastGaussianNext(T *data, const uint32_t stride,
   const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius) * static_cast<float>(radius));
 
   constexpr int bufLength = 1023;
-  auto bufferR = hwy::AllocateAligned<V>(bufLength + 1);
-  auto bufferG = hwy::AllocateAligned<V>(bufLength + 1);
-  auto bufferB = hwy::AllocateAligned<V>(bufLength + 1);
+  V bufferR[1024];
+  V bufferG[1024];
+  V bufferB[1024];
 
   const int channels = getPixelTypeComponents(PixelType);
 
@@ -346,7 +346,7 @@ void HorizontalFastGaussianNext(T *data, const uint32_t stride,
       int mPNextRad = (x + 2 * radius) & bufLength;
 
       auto srcNext = reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(data) + y * stride);
-      int px = std::clamp(x + radius, 0, static_cast<int32_t>(width - 1)) * channels;
+      int px = std::clamp(x + 3 * radius / 2, 0, static_cast<int32_t>(width - 1)) * channels;
       srcNext += px;
 
       V pR, pG, pB, pA;
@@ -364,18 +364,35 @@ void HorizontalFastGaussianNext(T *data, const uint32_t stride,
   }
 }
 
+void FastGaussianBlurRGBA(uint8_t *data, uint32_t stride, uint32_t width, uint32_t height, int radius) {
+  const int threadCount = concurrency::getThreadCounts(width, height);
+  concurrency::parallel_for_segment(threadCount, width, [&](uint32_t start, uint32_t end) {
+#if __aarch64__
+    VerticalGaussianPassRGBANeon(data, stride, width, height, radius, start, end);
+#else
+    VerticalGaussianPass<uint8_t, int, sparkyuv::PIXEL_RGBA>(data, stride, width, height, radius, start, end);
+#endif
+  });
+  concurrency::parallel_for_segment(threadCount, height, [&](uint32_t start, uint32_t end) {
+#if __aarch64__
+    HorizontalGaussianPassRGBANeon(data, stride, width, height, radius, start, end);
+#else
+    HorizontalGaussianPass<uint8_t, int, sparkyuv::PIXEL_RGBA>(data, stride, width, height, radius, start, end);
+#endif
+  });
+}
+
 #define FAST_GAUSSIAN_DECLARATION_R(pixelName, pixelType, storageType) \
     void FastGaussianBlur##pixelName(storageType *data, uint32_t stride, uint32_t width, uint32_t height, int radius) {\
       const int threadCount = concurrency::getThreadCounts(width, height);\
-      concurrency::parallel_for_segment(threadCount, width, [&](int start, int end) {\
+      concurrency::parallel_for_segment(threadCount, width, [&](int start, int end) { \
         VerticalGaussianPass<storageType, int, sparkyuv::PIXEL_##pixelType>(data, stride, width, height, radius, start, end);\
       });\
       concurrency::parallel_for_segment(threadCount, height, [&](int start, int end) {\
         HorizontalGaussianPass<storageType, int, sparkyuv::PIXEL_##pixelType>(data, stride, width, height, radius, start, end);\
       });\
     }
 
-FAST_GAUSSIAN_DECLARATION_R(RGBA, RGBA, uint8_t)
 FAST_GAUSSIAN_DECLARATION_R(RGB, RGB, uint8_t)
 #if SPARKYUV_FULL_CHANNELS
 FAST_GAUSSIAN_DECLARATION_R(ARGB, ARGB, uint8_t)

diff --git a/src/FastGaussianNeon.cpp b/src/FastGaussianNeon.cpp
@@ -0,0 +1,168 @@
+// Copyright (C) 2024 Radzivon Bartoshyk
+//
+// This file belongs to sparkyuv project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FastGaussianNeon.h"
+
+#if __aarch64__
+
+#include <arm_neon.h>
+#include <algorithm>
+
+namespace sparkyuv {
+
+#if defined(__GNUC__) || defined(__clang__)
+#define FAST_GAUSSIAN_INLINE __attribute__((flatten)) inline
+#else
+#define FAST_GAUSSIAN_INLINE inline
+#endif
+
+FAST_GAUSSIAN_INLINE int32x4_t vsld_u8_to_i32(const uint8_t *src, bool useVld) {
+  uint8_t safeStore[8] = {0};
+  const uint8_t *origin;
+  if (useVld) {
+    origin = src;
+  } else {
+    std::copy(src, src + 3, safeStore);
+    origin = safeStore;
+  }
+  auto fullPx = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(origin))));
+  return vmovl_s16(fullPx);
+}
+
+void VerticalGaussianPassRGBANeon(uint8_t *data,
+                                  const uint32_t stride,
+                                  const uint32_t width,
+                                  const uint32_t height,
+                                  const int radius,
+                                  const uint32_t start,
+                                  const uint32_t end) {
+  const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));
+
+  constexpr int64_t bufLength = 1023;
+  int32_t buffer[1024][4] = {{0}};
+
+  auto radius64 = static_cast<int64_t>(radius);
+
+  auto initialSum = (radius * radius) >> 1;
+
+  for (int64_t x = start; x < static_cast<int64_t> (width) && x < static_cast<int64_t>(end); ++x) {
+    int32x4_t dif = vdupq_n_s32(0);
+    int32x4_t sum = vdupq_n_s32(initialSum);
+    int64_t px = x * 4;
+    for (int64_t y = 0 - 2 * radius64; y < static_cast<int64_t>(height); ++y) {
+      if (y >= 0) {
+        auto src = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(data) + static_cast<uint32_t >(y) * stride);
+        int64_t arrIndex = (y - radius64) & bufLength;
+        int64_t dArrIndex = y & bufLength;
+
+        auto p16 = vqmovun_s32(vcvtq_s32_f32(vrndq_f32(vmulq_n_f32(vcvtq_f32_s32(sum), weight))));
+        auto p8 = vqmovn_u16(vcombine_u16(p16, p16));
+
+        src[px] = vget_lane_u8(p8, 0);
+        src[px + 1] = vget_lane_u8(p8, 1);
+        src[px + 2] = vget_lane_u8(p8, 2);
+
+        int32x4_t bufferValue1 = vld1q_s32(reinterpret_cast<const int *>(&buffer[arrIndex][0]));
+
+        int32x4_t bufferValue2 = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
+        bufferValue2 = vshlq_n_s32(bufferValue2, 1);
+
+        dif = vaddq_s32(dif, vsubq_s32(bufferValue1, bufferValue2));
+      } else if (y + radius >= 0) {
+        int64_t dArrIndex = y & bufLength;
+        int32x4_t bufferValue = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
+        bufferValue = vshlq_n_s32(bufferValue, 1);
+        dif = vsubq_s32(dif, bufferValue);
+      }
+
+      auto srcNext =
+          reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(data)
+              + std::clamp(static_cast<int32_t >(y + radius), 0, static_cast<int32_t >(height - 1)) * stride);
+      int32x4_t p = vsld_u8_to_i32(&srcNext[px], x + 2 < width);
+      dif = vaddq_s32(dif, p);
+      sum = vaddq_s32(sum, dif);
+
+      auto arrIndex = (y + radius64) & bufLength;
+
+      vst1q_s32(&buffer[arrIndex][0], p);
+    }
+  }
+}
+
+void HorizontalGaussianPassRGBANeon(uint8_t *data,
+                                    const uint32_t stride,
+                                    const uint32_t width,
+                                    const uint32_t height,
+                                    const int radius,
+                                    const uint32_t start,
+                                    const uint32_t end) {
+  const float weight = 1.f / (static_cast<float>(radius) * static_cast<float>(radius));
+
+  constexpr int64_t bufLength = 1023;
+  int32_t buffer[1024][4] = {{0}};
+
+  auto radius64 = static_cast<int64_t>(radius);
+
+  auto initialSum = (radius * radius) >> 1;
+
+  for (auto y = static_cast<int64_t>(start); y < static_cast<int64_t>(height) && y < static_cast<int64_t>(end); ++y) {
+    int32x4_t dif = vdupq_n_s32(0);
+    int32x4_t sum = vdupq_n_s32(initialSum);
+    for (int64_t x = 0 - 2 * radius64; x < static_cast<int64_t>(width); ++x) {
+      if (x >= 0) {
+        const int64_t px = x * 4;
+
+        auto src = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(data) + static_cast<uint32_t >(y) * stride);
+        int64_t arrIndex = (x - radius64) & bufLength;
+        int64_t dArrIndex = x & bufLength;
+
+        auto p16 = vqmovun_s32(vcvtq_s32_f32(vrndq_f32(vmulq_n_f32(vcvtq_f32_s32(sum), weight))));
+        auto p8 = vqmovn_u16(vcombine_u16(p16, p16));
+
+        src[px] = vget_lane_u8(p8, 0);
+        src[px + 1] = vget_lane_u8(p8, 1);
+        src[px + 2] = vget_lane_u8(p8, 2);
+
+        int32x4_t bufferValue1 = vld1q_s32(reinterpret_cast<const int *>(&buffer[arrIndex][0]));
+
+        int32x4_t bufferValue2 = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
+        bufferValue2 = vshlq_n_s32(bufferValue2, 1);
+
+        dif = vaddq_s32(dif, vsubq_s32(bufferValue1, bufferValue2));
+      } else if (x + radius >= 0) {
+        int64_t dArrIndex = x & bufLength;
+        int32x4_t bufferValue = vld1q_s32(reinterpret_cast<const int *>(&buffer[dArrIndex][0]));
+        bufferValue = vshlq_n_s32(bufferValue, 1);
+        dif = vsubq_s32(dif, bufferValue);
+      }
+
+      auto srcNext = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(data) + y * stride);
+      int64_t px =
+          std::clamp(static_cast<int64_t >(x + radius), static_cast<int64_t >(0), static_cast<int64_t>(width - 1));
+      int32x4_t p = vsld_u8_to_i32(&srcNext[px*4], px + 2 < width);
+      dif = vaddq_s32(dif, p);
+      sum = vaddq_s32(sum, dif);
+
+      auto arrIndex = (x + radius64) & bufLength;
+
+      vst1q_s32(&buffer[arrIndex][0], p);
+    }
+  }
+}
+
+}
+
+#endif
diff --git a/src/FastGaussianNeon.h b/src/FastGaussianNeon.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2024 Radzivon Bartoshyk
+//
+// This file belongs to sparkyuv project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef YUV_SRC_FASTGAUSSIANNEON_H_
+#define YUV_SRC_FASTGAUSSIANNEON_H_
+
+#if __aarch64__
+
+#include <cstdint>
+
+namespace sparkyuv {
+void VerticalGaussianPassRGBANeon(uint8_t *data,
+                                  uint32_t stride,
+                                  uint32_t width,
+                                  uint32_t height,
+                                  int radius,
+                                  uint32_t start,
+                                  uint32_t end);
+
+void HorizontalGaussianPassRGBANeon(uint8_t *data,
+                                    uint32_t stride,
+                                    uint32_t width,
+                                    uint32_t height,
+                                    int radius,
+                                    uint32_t start,
+                                    uint32_t end);
+}
+
+#endif
+
+#endif //YUV_SRC_FASTGAUSSIANNEON_H_
diff --git a/src/concurrency.hpp b/src/concurrency.hpp
@@ -147,12 +147,12 @@ void parallel_for_with_thread_id(const int numThreads, const int numIterations,
 }
 
 template<typename Function, typename... Args>
-void parallel_for_segment(const int numThreads, const int numIterations, Function &&func, Args &&... args) {
+void parallel_for_segment(const int numThreads, const uint32_t numIterations, Function &&func, Args &&... args) {
   static_assert(std::is_invocable_v<Function, int, int, Args...>, "func must take an int parameter for iteration id");
 #if THREADS_SUPPORTED
   std::vector<std::thread> threads;
 
-  int segmentHeight = numIterations / numThreads;
+  int segmentHeight = numIterations / static_cast<uint32_t >(numThreads);
 
   auto parallelWorker = [&](int start, int end) {
     std::invoke(func, start, end, std::forward<Args>(args)...);
@@ -161,8 +161,8 @@ void parallel_for_segment(const int numThreads, const int numIterations, Functio
   if (numThreads > 1) {
     // Launch N-1 worker threads
     for (int i = 1; i < numThreads; ++i) {
-      int start = i * segmentHeight;
-      int end = (i + 1) * segmentHeight;
+      uint32_t start = i * segmentHeight;
+      uint32_t end = (i + 1) * segmentHeight;
       if (i == numThreads - 1) {
         end = numIterations;
       }