Skip to content

Commit

Permalink
F16 load improvements
Browse files (browse the repository at this point in the history)
  • Loading branch information
awxkee committed Apr 20, 2024
1 parent 2edeb67 commit 2d34960
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 33 deletions.
31 changes: 15 additions & 16 deletions src/Channels-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "hwy/highway.h"
#include "yuv-inl.h"
#include "sparkyuv-internal.h"
#include "TypeSupport.h"

HWY_BEFORE_NAMESPACE();
namespace sparkyuv::HWY_NAMESPACE {
Expand Down Expand Up @@ -319,47 +320,45 @@ ReformatSurfaceF16ToU(const uint16_t *SPARKYUV_RESTRICT src, const uint32_t srcS

uint16_t out0, out1, out2, out3;

auto f16Source = reinterpret_cast<const hwy::float16_t*>(srcPixels);

switch (Surface) {
case SURFACE_CHANNEL: {
r = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[0])) * scale;
r = LoadFloat(&f16Source[0]) * scale;
out0 = static_cast<uint16_t>(r);
}
break;
case SURFACE_CHANNELS_3: {
r = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[0])) * scale;
r = LoadFloat(&f16Source[0]) * scale;
out0 = static_cast<uint16_t>(r);
g = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[1])) * scale;
g = LoadFloat(&f16Source[1]) * scale;
out1 = static_cast<uint16_t>(g);
b = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[2])) * scale;
b = LoadFloat(&f16Source[2]) * scale;
out2 = static_cast<uint16_t>(b);
}
break;
case SURFACE_CHANNELS_4: {
r = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[0])) * scale;
r = LoadFloat(&f16Source[0]) * scale;
out0 = static_cast<uint16_t>(r);
g = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[1])) * scale;
g = LoadFloat(&f16Source[1]) * scale;
out1 = static_cast<uint16_t>(g);
b = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[2])) * scale;
b = LoadFloat(&f16Source[2]) * scale;
out2 = static_cast<uint16_t>(b);
a = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[3])) * scale;
a = LoadFloat(&f16Source[3]) * scale;
out3 = static_cast<uint16_t>(a);
}
break;
case SURFACE_RGBA1010102: {
uint16_t bits0 = srcPixels[0];
float s1 = hwy::F32FromF16(hwy::float16_t::FromBits(bits0));
float s1 = LoadFloat(&f16Source[0]);
r = ::roundf(s1 * scale);
out0 = static_cast<uint16_t>(r);
uint16_t bits1 = srcPixels[1];
float s2 = hwy::F32FromF16(hwy::float16_t::FromBits(bits1));
float s2 = LoadFloat(&f16Source[1]);
g = ::roundf(s2 * scale);
out1 = static_cast<uint16_t>(g);
uint16_t bits2 = srcPixels[2];
float s3 = hwy::F32FromF16(hwy::float16_t::FromBits(bits2));
float s3 = LoadFloat(&f16Source[2]);
b = ::roundf(s3 * scale);
out2 = static_cast<uint16_t>(b);
uint16_t bits3 = srcPixels[3];
float s4 = hwy::F32FromF16(hwy::float16_t::FromBits(bits3));
float s4 = LoadFloat(&f16Source[3]);
a = std::min(::roundf(s4 * 3.f), 3.f);
out3 = static_cast<uint16_t>(a);
}
Expand Down
27 changes: 15 additions & 12 deletions src/RGB565Reformat-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "hwy/highway.h"
#include "src/yuv-inl.h"
#include "src/sparkyuv-internal.h"
#include "TypeSupport.h"

HWY_BEFORE_NAMESPACE();
namespace sparkyuv::HWY_NAMESPACE {
Expand Down Expand Up @@ -340,31 +341,33 @@ ReformatF16ToRGB565Impl(const uint16_t *SPARKYUV_RESTRICT src, const uint32_t sr
float g;
float b;

auto castedSource = reinterpret_cast<const hwy::float16_t*>(srcPixels);

switch (PixelType) {
case REFORMAT_RGBA:
case REFORMAT_RGB: {
r = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[0]));
g = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[1]));
b = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[2]));
r = LoadFloat(&castedSource[0]);
g = LoadFloat(&castedSource[1]);
b = LoadFloat(&castedSource[2]);
}
break;
case REFORMAT_BGRA:
case REFORMAT_BGR: {
b = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[0]));
g = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[1]));
r = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[2]));
b = LoadFloat(&castedSource[0]);
g = LoadFloat(&castedSource[1]);
r = LoadFloat(&castedSource[2]);
}
break;
case REFORMAT_ARGB: {
r = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[1]));
g = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[2]));
b = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[3]));
r = LoadFloat(&castedSource[1]);
g = LoadFloat(&castedSource[2]);
b = LoadFloat(&castedSource[3]);
}
break;
case REFORMAT_ABGR: {
b = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[1]));
g = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[2]));
r = hwy::F32FromF16(hwy::float16_t::FromBits(srcPixels[3]));
b = LoadFloat(&castedSource[1]);
g = LoadFloat(&castedSource[2]);
r = LoadFloat(&castedSource[3]);
}
break;
}
Expand Down
10 changes: 5 additions & 5 deletions src/TypeSupport.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

template<typename T, ENABLE_TYPE_IS_F16(T)>
TYPE_INLINE float LoadFloat(const T *src) {
#if HWY_HAVE_FLOAT16
#if HWY_HAVE_FLOAT16 || HWY_HAVE_SCALAR_F16_OPERATORS
return static_cast<float>(src[0]);
#else
auto uSource = reinterpret_cast<const uint16_t *>(src);
Expand All @@ -47,7 +47,7 @@ TYPE_INLINE float LoadFloat(const T *src) {

template<typename V, typename T, ENABLE_TYPE_IS_F16(T)>
TYPE_INLINE V LoadPixel(const T *src) {
#if HWY_HAVE_FLOAT16
#if HWY_HAVE_FLOAT16 || HWY_HAVE_SCALAR_F16_OPERATORS
return static_cast<V>(src[0]);
#else
auto uSource = reinterpret_cast<const uint16_t *>(src);
Expand All @@ -67,7 +67,7 @@ TYPE_INLINE V TransformCast(T t) {

template<typename V, typename T, ENABLE_TYPE_IS_F16(V)>
TYPE_INLINE V TransformCast(T t) {
#if HWY_HAVE_FLOAT16
#if HWY_HAVE_FLOAT16 || HWY_HAVE_SCALAR_F16_OPERATORS
return static_cast<hwy::float16_t>(t);
#else
return hwy::F16FromF32(t);
Expand All @@ -81,7 +81,7 @@ TYPE_INLINE void StoreRoundedFloat(V *v, T t) {

template<typename T, typename V, ENABLE_TYPE_IS_F16(V)>
TYPE_INLINE void StoreRoundedFloat(V *v, T t) {
#if HWY_HAVE_FLOAT16
#if HWY_HAVE_FLOAT16 || HWY_HAVE_SCALAR_F16_OPERATORS
v[0] = static_cast<V>(::roundf(t));
#else
reinterpret_cast<uint16_t *>(v)[0] = hwy::F16FromF32(::roundf(t)).bits;
Expand All @@ -95,7 +95,7 @@ TYPE_INLINE void StoreFloat(V *v, T t) {

template<typename T, typename V, ENABLE_TYPE_IS_F16(V)>
TYPE_INLINE void StoreFloat(V *v, T t) {
#if HWY_HAVE_FLOAT16
#if HWY_HAVE_FLOAT16 || HWY_HAVE_SCALAR_F16_OPERATORS
v[0] = static_cast<V>(t);
#else
reinterpret_cast<uint16_t *>(v)[0] = hwy::F16FromF32(t).bits;
Expand Down

0 comments on commit 2d34960

Please sign in to comment.