diff --git a/src/GaussianBlur-inl.h b/src/GaussianBlur-inl.h index 557790c..e4515b9 100644 --- a/src/GaussianBlur-inl.h +++ b/src/GaussianBlur-inl.h @@ -105,7 +105,199 @@ GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t sr template::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value, int>::type = 0, + ENABLE_TYPE_IS_F16(T)> +/* Vertical blur pass (F16-gated overload). For each row y in [startY, endY) and each x in [0, width) it sums, over r in [-halfOfKernel, maxKernel], the 4-lane pixel at lane offset x * 4 of row clamp(y + r, 0, height - 1) scaled by mKernel[halfOfKernel + r], accumulating in df lanes, then demotes and stores 4 lanes to the destination row. NOTE(review): the <...> template-argument lists in this diff appear to have been stripped (FixedTag, Vec, static_cast, reinterpret_cast), so lane/element types are not visible here - confirm against the real header. NOTE(review): shiftX and srcStride are both uint32_t, so shiftX * srcStride is evaluated in 32 bits - confirm that row offsets of 4 GiB or more cannot occur. */ void +GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, + T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, + const uint32_t startY, const uint32_t endY, + const uint32_t width, const uint32_t height, + const float *mKernel, const int kernelSize) { + const int halfOfKernel = kernelSize / 2; + const bool isEven = kernelSize % 2 == 0; + const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; + + auto mDst = reinterpret_cast(mDestination); + int64_t maxHeight = static_cast(height) - 1; + + const FixedTag d16x8; + const FixedTag d16x4; + const FixedTag df16; + const FixedTag df16x4; + const FixedTag df; + using VF = Vec; + + for (uint32_t y = startY; y < endY; ++y) { + auto dst = reinterpret_cast(mDst + dstStride * y); + for (uint32_t x = 0; x < width; ++x) { + int r = -halfOfKernel; + + VF accumulator = Zero(df); + + auto kx = static_cast(x) * 4; + + for (; r <= maxKernel; ++r) { + uint32_t shiftX = std::clamp(static_cast(y) + static_cast(r), + static_cast(0), + static_cast(maxHeight)); + auto localSource = reinterpret_cast(reinterpret_cast(mSource) + shiftX * srcStride); + float weight = mKernel[halfOfKernel + r]; + VF pixelData; +#if SPARKYUV_ALLOW_FLOAT16 + const auto pxf16 = LoadU(df16x4, &localSource[kx]); + pixelData = PromoteTo(df, pxf16); +#else + const auto pxf16 = BitCast(df16x4, LoadU(d16x4, reinterpret_cast(&localSource[kx]))); + pixelData = PromoteTo(df, pxf16); +#endif + accumulator = Add(accumulator, Mul(pixelData, Set(df, weight))); + } + +#if SPARKYUV_ALLOW_FLOAT16 + StoreU(DemoteTo(df16x4, accumulator), 
df16x4, dst); +#else + auto duStore = BitCast(d16x4, DemoteTo(df16x4, accumulator)); + StoreU(duStore, d16x4, reinterpret_cast(dst)); +#endif + + dst += 4; + } + } +} + +template::type = 0, + typename std::enable_if::value, int>::type = 0, + ENABLE_TYPE_IS_F16(T)> +/* Horizontal blur pass (F16-gated overload). For each row y in [startY, endY) and each x in [0, width) it sums the pixels at columns clamp(x + r, 0, width - 1), r in [-halfOfKernel, maxKernel], each scaled by mKernel[halfOfKernel + r], and stores the demoted 4-lane result. Taps are consumed two at a time where possible, then one at a time. */ void +GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, + T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, + const uint32_t startY, const uint32_t endY, + const uint32_t width, const uint32_t /* height */, + const float *mKernel, const int kernelSize) { + const int halfOfKernel = kernelSize / 2; + const bool isEven = kernelSize % 2 == 0; + const int maxKernel = isEven ? halfOfKernel - 1 : halfOfKernel; + + auto mDst = reinterpret_cast(mDestination); + int maxWidth = static_cast(width) - 1; + int sZero = 0; + + const FixedTag d16x8; + const FixedTag d16x4; + const FixedTag df16; + const FixedTag df16x4; + const FixedTag df; + using VF = Vec; + + for (uint32_t y = startY; y < endY; ++y) { + auto dst = reinterpret_cast(mDst + dstStride * y); + auto localSource = reinterpret_cast(reinterpret_cast(mSource) + y * srcStride); + for (uint32_t x = 0; x < width; ++x) { + int r = -halfOfKernel; + + VF accumulator = Zero(df); + auto kx = static_cast(x); + + /* Two-tap path: one load covers the pixels for taps r and r + 1 (lower/upper halves), so it is only taken while both columns kx + r and kx + r + 1 lie inside [0, maxWidth]. A clamped base column would give tap r + 1 the wrong neighbour at the left edge and read past the row at the right edge; those taps fall through to the clamped one-tap loop below. Assumes kx is a signed int (the cast's type argument is stripped here) - TODO confirm. */ for (; r + 2 <= maxKernel && kx + r >= 0 && kx + r + 1 <= maxWidth; r += 2) { + int sourcePX = (kx + r) * 4; + auto movedSrc = localSource + sourcePX; + const float weight1 = mKernel[halfOfKernel + r]; + const float weight2 = mKernel[halfOfKernel + r + 1]; + VF pixelData1; + VF pixelData2; + +#if SPARKYUV_ALLOW_FLOAT16 + const auto pxf16 = LoadU(df16, movedSrc); + pixelData1 = PromoteLowerTo(df, pxf16); + pixelData2 = PromoteUpperTo(df, pxf16); +#else + const auto pxf16 = BitCast(df16, LoadU(d16x8, reinterpret_cast(movedSrc))); + pixelData1 = PromoteLowerTo(df, pxf16); + pixelData2 = PromoteUpperTo(df, pxf16); +#endif + + accumulator = Add(accumulator, Mul(pixelData1, Set(df, weight1))); + accumulator = 
Add(accumulator, Mul(pixelData2, Set(df, weight2))); + } + + /* One-tap path: every remaining tap, with its column clamped to [0, maxWidth]. */ for (; r <= maxKernel; ++r) { + float weight = mKernel[halfOfKernel + r]; + int sourcePX = std::clamp(kx + r, sZero, maxWidth) * 4; + auto movedSrc = localSource + sourcePX; + VF pixelData; +#if SPARKYUV_ALLOW_FLOAT16 + const auto pxf16 = LoadU(df16x4, movedSrc); + pixelData = PromoteTo(df, pxf16); +#else + const auto pxf16 = BitCast(df16x4, LoadU(d16x4, reinterpret_cast(movedSrc))); + pixelData = PromoteTo(df, pxf16); +#endif + accumulator = Add(accumulator, Mul(pixelData, Set(df, weight))); + } + +#if SPARKYUV_ALLOW_FLOAT16 + StoreU(DemoteTo(df16x4, accumulator), df16x4, dst); +#else + auto duStore = BitCast(d16x4, DemoteTo(df16x4, accumulator)); + StoreU(duStore, d16x4, reinterpret_cast(dst)); +#endif + + dst += 4; + } + } +} + +template::type = 0, + typename std::enable_if::value, int>::type = 0, + ENABLE_TYPE_IS_F16(T)> +/* Vertical blur pass using the d8x4 / d32 / df tags. Per output pixel it loads 4 lanes at offset x * 4 from each row clamp(y + r, 0, height - 1), promotes through d32 and converts to df, accumulates the values scaled by mKernel[halfOfKernel + r], then rounds, converts back through d32, demotes to d8x4 and stores. NOTE(review): this overload is also gated by ENABLE_TYPE_IS_F16(T); the enable_if arguments that would distinguish it are stripped in this diff - confirm the intended gate. */ void +GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, + T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, + const uint32_t startY, const uint32_t endY, + const uint32_t width, const uint32_t height, + const float *mKernel, const int kernelSize) { + const int halfOfKernel = kernelSize / 2; + const bool isEven = kernelSize % 2 == 0; + const int maxKernel = isEven ? 
halfOfKernel - 1 : halfOfKernel; + + auto mDst = reinterpret_cast(mDestination); + int64_t maxHeight = static_cast(height) - 1; + + const FixedTag d8x4; + const FixedTag d32; + const FixedTag df; + using VF = Vec; + + for (uint32_t y = startY; y < endY; ++y) { + auto dst = reinterpret_cast(mDst + dstStride * y); + for (uint32_t x = 0; x < width; ++x) { + int r = -halfOfKernel; + VF acc = Zero(df); + + for (; r <= maxKernel; ++r) { + uint32_t shiftX = std::clamp(static_cast(y) + static_cast(r), + static_cast(0), + static_cast(maxHeight)); + auto localSource = reinterpret_cast(reinterpret_cast(mSource) + shiftX * srcStride); + float weight = mKernel[halfOfKernel + r]; + uint32_t sourcePX = x * 4; + auto vx = ConvertTo(df, PromoteTo(d32, LoadU(d8x4, &localSource[sourcePX]))); + acc = Add(acc, Mul(vx, Set(df, weight))); + } + acc = Round(acc); + auto newPX = DemoteTo(d8x4, ConvertTo(d32, acc)); + StoreU(newPX, d8x4, dst); + dst += 4; + } + } +} + +template::type = 0, + typename std::enable_if::value, int>::type = 0, + ENABLE_TYPE_IS_NOT_F16(T)> void GaussianBlurHorizontalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, @@ -326,7 +518,8 @@ GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcS template::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value, int>::type = 0, + HWY_IF_NOT_F16(T)> void GaussianBlurVerticalPass(const T *SPARKYUV_RESTRICT mSource, const uint32_t srcStride, T *SPARKYUV_RESTRICT mDestination, const uint32_t dstStride, diff --git a/tools/main.cpp b/tools/main.cpp index 47db508..b5d8802 100644 --- a/tools/main.cpp +++ b/tools/main.cpp @@ -327,8 +327,9 @@ int main() { height); bench(1, ANSI_COLOR_GREEN, "Fast Gaussian", [&]() { - sparkyuv::FastGaussianNextBlurRGBAF16(reinterpret_cast(f16Store.data()), width * 4 * sizeof(uint16_t), - width, height, 15); + 
/* NOTE(review): source and destination are the same f16Store buffer here, while the pass functions take SPARKYUV_RESTRICT pointers - confirm GaussianBlurRGBAF16 stages through a temporary. */ sparkyuv::GaussianBlurRGBAF16(reinterpret_cast(f16Store.data()), width * 4 * sizeof(uint16_t), + reinterpret_cast(f16Store.data()), width * 4 * sizeof(uint16_t), + width, height, 15 * 6.f, 6.f); }); sparkyuv::RGBAF16ToRGBA(reinterpret_cast(f16Store.data()), width * 4 * sizeof(uint16_t), rgbaData.data(),