Skip to content

Commit

Permalink
added sve
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Apr 22, 2024
1 parent d42b52e commit 5ad3577
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 27 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ project(yuv)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 20)

add_definitions(-DHWY_DISABLED_TARGETS=\(HWY_SVE|HWY_SVE2|HWY_SVE_256|HWY_SVE2_128|HWY_SCALAR\))
add_definitions(-DHWY_DISABLED_TARGETS=\(HWY_SCALAR\))
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+simd+crypto+sve")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+simd+crypto+sve")

include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-pthread" COMPILER_SUPPORTS_THREADS)
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM --platform=linux/ppc64le ubuntu:latest
FROM ubuntu:latest

RUN apt-get -y update && apt-get -y install build-essential ninja-build cmake

Expand Down
3 changes: 2 additions & 1 deletion src/BlendAlpha-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,12 @@ UnpremultiplyAlpha8HWY(const uint8_t *SPARKYUV_RESTRICT src, const uint32_t srcS
auto Ah = PromoteUpperTo(du16, A);
auto Al = PromoteLowerTo(du16, A);

#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
// Substitute mScale for every zero alpha lane so the Div() calls that follow
// never divide by zero. Each half is tested against its OWN alpha vector:
// the upper-half mask comes from Ah and the lower-half mask from Al.
// (Previously the lower mask was also derived from Ah, leaving zero lanes in
// Al unpatched and overwriting unrelated Al lanes.)
// This fix-up is compiled out on SVE targets by the guard above.
const auto ahMask = Ah == zeros;
const auto alMask = Al == zeros;

Ah = IfThenElse(ahMask, mScale, Ah);
Al = IfThenElse(alMask, mScale, Al);
#endif

const auto Rh = Div(WidenMulHigh(du8, R, scale), Ah);
const auto Rl = Div(WidenMul(dhu8, LowerHalf(R), lScale), Al);
Expand Down
2 changes: 1 addition & 1 deletion src/ITUR/eotf-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ static float HLGOetf(float linear) {
} else if (linear <= (1.0f / 12.0f)) {
return ::sqrtf(3.0f * linear);
} else {
return 0.17883277f * logf(12.0f * linear - 0.28466892f) + 0.55991073f;
return 0.17883277f * ::logf(12.0f * linear - 0.28466892f) + 0.55991073f;
}
}

Expand Down
23 changes: 20 additions & 3 deletions src/YcCbcCrc-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@
#include "sparkyuv-internal.h"
#include "Eotf-inl.h"

#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
#define YCCBCCRC_HWY_ENABLED 1
#else
#define YCCBCCRC_HWY_ENABLED 0
#endif

HWY_BEFORE_NAMESPACE();
namespace sparkyuv::HWY_NAMESPACE {

Expand All @@ -42,13 +48,15 @@ void computeYcCbcCrcCutoffs(const SparkYuvTransferFunction transferFunction,
Pr = 1.f - Oetf(kr, transferFunction);
}

#if YCCBCCRC_HWY_ENABLED
/**
 * @brief Vector form of the piecewise chroma scaling.
 *
 * Per lane:
 *   - low <= dx <= 0  -> dx * scaleLow
 *   - 0 < dx < high   -> dx * scaleHigh
 *   - otherwise       -> 0
 *
 * @param d         Highway tag describing the float vector type.
 * @param dx        Input difference values.
 * @param low       Inclusive lower bound of the non-positive branch.
 * @param high      Exclusive upper bound of the positive branch.
 * @param scaleLow  Multiplier applied on the non-positive branch.
 * @param scaleHigh Multiplier applied on the positive branch.
 * @return Scaled values, with zero in lanes that match neither branch.
 */
template<typename D, typename V = Vec<D>, HWY_IF_FLOAT_D(D)>
EOTF_INLINE V computeYcCbcCrcEquation(D d, V dx, V low, V high, V scaleLow, V scaleHigh) {
  const auto vZero = Zero(d);
  // The two ranges are disjoint (dx <= 0 versus dx > 0), so at most one mask is set per lane.
  const auto lowerRange = And(dx >= low, dx <= vZero);
  const auto upperRange = And(dx > vZero, dx < high);
  const auto lowerResult = IfThenElse(lowerRange, Mul(dx, scaleLow), vZero);
  return IfThenElse(upperRange, Mul(dx, scaleHigh), lowerResult);
}
#endif

EOTF_INLINE float computeYcCbcCrcEquation(float dx, float low, float high, float scaleLow, float scaleHigh) {
float Eb;
Expand Down Expand Up @@ -113,17 +121,17 @@ PixelToYcCbcCrcHWY(const T *SPARKYUV_RESTRICT src, const uint32_t srcStride,
const float divisor2 = 1.f / 2.f;
const float divisor4 = 1.f / 4.f;

#if YCCBCCRC_HWY_ENABLED
const ScalableTag<uint16_t> du16;
const Half<decltype(du16)> dhu16;
const Rebind<T, decltype(du16)> d;
const Half<decltype(d)> dh;
const int lanes = Lanes(d);
const int uvLanes = (chromaSubsample == YUV_SAMPLE_444) ? lanes : Lanes(dh);
const Rebind<uint32_t, decltype(dhu16)> du32;
const Rebind<float, decltype(du32)> df;
using VF = Vec<decltype(df)>;
using VU = Vec<decltype(du16)>;

const int lanes = Lanes(d);
const int uvLanes = (chromaSubsample == YUV_SAMPLE_444) ? lanes : Lanes(dh);
const auto vScaleRangeY = Set(df, rangeY);
const auto vScaleRangeUV = Set(df, rangeUV);
const auto vBiasY = Set(df, biasY);
Expand All @@ -142,6 +150,7 @@ PixelToYcCbcCrcHWY(const T *SPARKYUV_RESTRICT src, const uint32_t srcStride,
const auto vKg = Set(df, kg);
const auto alpha = Set(d, maxColors);
const auto v420Scale = Set(df, 0.5f);
#endif

for (uint32_t y = 0; y < height; ++y) {
uint32_t x = 0;
Expand All @@ -151,6 +160,7 @@ PixelToYcCbcCrcHWY(const T *SPARKYUV_RESTRICT src, const uint32_t srcStride,
auto vDst = reinterpret_cast<T *>(vStore);
auto mSrc = reinterpret_cast<const T *>(mSource);

#if YCCBCCRC_HWY_ENABLED
for (; x + lanes < width; x += lanes) {
VF Rh, Rl, Gh, Gl, Bh, Bl;
if (std::is_same<T, uint16_t>::value) {
Expand Down Expand Up @@ -317,6 +327,7 @@ PixelToYcCbcCrcHWY(const T *SPARKYUV_RESTRICT src, const uint32_t srcStride,
vDst += uvLanes;
mSrc += lanes*components;
}
#endif

for (; x < width; x += lanesForward) {
float r;
Expand Down Expand Up @@ -570,6 +581,7 @@ EOTF_INLINE T YcCbcCrcInverse(T Y, T v, T scaleLow, T scaleHigh, T low, T high)
return x;
}

#if YCCBCCRC_HWY_ENABLED
template<class D, class V = Vec<D>>
EOTF_INLINE V YcCbcCrcInverse(D d, V Y, V v, V scaleLow, V scaleHigh, V low, V high) {
const V l = Mul(v, scaleLow);
Expand All @@ -581,6 +593,7 @@ EOTF_INLINE V YcCbcCrcInverse(D d, V Y, V v, V scaleLow, V scaleHigh, V low, V h
cv = IfThenElse(And(diffh > zeros, diffh <= high), Add(h, Y), cv);
return cv;
}
#endif

/**
* @brief It will be good to declare a type of transfer function
Expand Down Expand Up @@ -632,6 +645,7 @@ void YcCbcCrcToXRGB(T *SPARKYUV_RESTRICT rgbaData, const uint32_t dstStride,

const float ekg = 1.f / kg;

#if YCCBCCRC_HWY_ENABLED
const ScalableTag<uint16_t> du16;
const Half<decltype(du16)> dhu16;
const Rebind<T, decltype(du16)> d;
Expand Down Expand Up @@ -690,6 +704,7 @@ void YcCbcCrcToXRGB(T *SPARKYUV_RESTRICT rgbaData, const uint32_t dstStride,
#endif

const auto A = Set(du16, maxColors);
#endif

for (int y = 0; y < height; ++y) {
auto CbSource = reinterpret_cast<const T *>(mUSrc);
Expand All @@ -699,6 +714,7 @@ void YcCbcCrcToXRGB(T *SPARKYUV_RESTRICT rgbaData, const uint32_t dstStride,

uint32_t x = 0;

#if YCCBCCRC_HWY_ENABLED
for (; x + lanes < width; x += lanes) {
#if SPARKYUV_ALLOW_FLOAT16
VF Y, Cb, Cr;
Expand Down Expand Up @@ -835,6 +851,7 @@ void YcCbcCrcToXRGB(T *SPARKYUV_RESTRICT rgbaData, const uint32_t dstStride,
CrSource += uvLanes;
store += lanes * components;
}
#endif

for (; x < width; x += lanesForward) {
const T uValue = reinterpret_cast<const T *>(CbSource)[0];
Expand Down
24 changes: 21 additions & 3 deletions src/sampler/BilinearRowSampler-inl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,22 @@

#include <hwy/highway.h>
#include "ScaleRowSampler.hpp"
#include "sampler.h"
#include "sampler-inl.h"
#include "../yuv-inl.h"
#include "sampler.h"
#include <cstdint>
#include <algorithm>
#include <cmath>

#if HWY_TARGET != HWY_SVE && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
#define BILINEAR_ENABLE_HWY 1
#else
#define BILINEAR_ENABLE_HWY 0
#endif

#if BILINEAR_ENABLE_HWY
#include "sampler-inl.h"
#endif

HWY_BEFORE_NAMESPACE();
namespace sparkyuv::HWY_NAMESPACE {
using namespace sparkyuv;
Expand Down Expand Up @@ -61,6 +70,7 @@ class BilinearRowSampler4Chan8Bit : public ScaleRowSampler<uint8_t> {
~BilinearRowSampler4Chan8Bit() override = default;

void sample(const int row) override {
#if BILINEAR_ENABLE_HWY
const FixedTag<float32_t, 4> dfx4;
const FixedTag<uint32_t, 4> dix4;
const FixedTag<uint8_t, 4> du8x4;
Expand All @@ -82,7 +92,7 @@ class BilinearRowSampler4Chan8Bit : public ScaleRowSampler<uint8_t> {
const VF4 vfZeros = Zero(dfx4);
const VI4 srcStrideV = Set(dix4, this->srcStride);
const VF4 maxColorsV = Set(dfx4, maxColors);

#endif
auto dst8 = reinterpret_cast<uint8_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + row * this->dstStride);
auto dst = reinterpret_cast<uint8_t *>(dst8);

Expand All @@ -91,6 +101,7 @@ class BilinearRowSampler4Chan8Bit : public ScaleRowSampler<uint8_t> {

uint32_t x = 0;

#if BILINEAR_ENABLE_HWY
#if !NOACCELERATED_SAMPLER
for (; x + 8 < this->outputWidth && components == 4; ++x) {
VI4 currentX = Set(dix4, x);
Expand Down Expand Up @@ -135,6 +146,7 @@ class BilinearRowSampler4Chan8Bit : public ScaleRowSampler<uint8_t> {

x += components - 1;
}
#endif
#endif

for (; x < this->outputWidth; ++x) {
Expand Down Expand Up @@ -196,6 +208,7 @@ class BilinearRowSamplerF16Bit : public ScaleRowSampler<uint16_t> {
~BilinearRowSamplerF16Bit() override = default;

void sample(const int y) override {
#if BILINEAR_ENABLE_HWY
const FixedTag<float32_t, 4> dfx4;
const FixedTag<int32_t, 4> dix4;
const FixedTag<hwy::float16_t, 4> df16x4;
Expand All @@ -213,6 +226,7 @@ class BilinearRowSamplerF16Bit : public ScaleRowSampler<uint16_t> {
const VI4 maxHeight = Set(dix4, this->inputHeight - 1);
const VF4 vfZeros = Zero(dfx4);
const VI4 srcStrideV = Set(dix4, this->srcStride);
#endif

const auto src8 = reinterpret_cast<const uint8_t *>(this->mSource);
auto dst16 = reinterpret_cast<uint16_t *>(reinterpret_cast<uint8_t *>(this->mDestination) + y * this->dstStride);
Expand All @@ -221,6 +235,7 @@ class BilinearRowSamplerF16Bit : public ScaleRowSampler<uint16_t> {

uint32_t x = 0;

#if BILINEAR_ENABLE_HWY
#if !NOACCELERATED_SAMPLER
for (; x + 8 < this->outputWidth && components == 4; ++x) {
VI4 currentX = Set(dix4, x);
Expand Down Expand Up @@ -263,6 +278,7 @@ class BilinearRowSamplerF16Bit : public ScaleRowSampler<uint16_t> {

x += components - 1;
}
#endif
#endif

for (; x < this->outputWidth; ++x) {
Expand Down Expand Up @@ -460,4 +476,6 @@ class BilinearRowSampler10Bit : public ScaleRowSampler<uint32_t> {
} // sparkyuv
HWY_AFTER_NAMESPACE();

#undef BILINEAR_ENABLE_HWY

#endif //SPARKYUV_BILINEAR_ROW_SAMPLER
1 change: 0 additions & 1 deletion src/sampler/NearestRowSampler-inl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

#include "hwy/highway.h"
#include "ScaleRowSampler.hpp"
#include "sampler-inl.h"
#include "sampler.h"
#include <cstdint>
#include <algorithm>
Expand Down
Loading

0 comments on commit 5ad3577

Please sign in to comment.