Skip to content

Commit

Permalink
Added support for hardware half float conversions.
Browse files Browse the repository at this point in the history
Check at startup if hardware half floats are supported for the current CPU
and use it when available. This will be used for modern x86 and all ARM
CPUs. Should improve the performance for both uncompressed half float and
BC6H formats.
  • Loading branch information
akb825 committed Jan 23, 2023
1 parent 27607ef commit 40dd18b
Show file tree
Hide file tree
Showing 8 changed files with 427 additions and 40 deletions.
38 changes: 37 additions & 1 deletion lib/include/cuttlefish/Config.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2016 Aaron Barany
* Copyright 2016-2023 Aaron Barany
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -91,6 +91,42 @@
#define CUTTLEFISH_64BIT 0
#endif

/**
* @brief Macro defined to whether or not the system is 64-bit x86.
*/
#if defined(__x86_64__) || defined(_M_AMD64)
#define CUTTLEFISH_X86_64 1
#else
#define CUTTLEFISH_X86_64 0
#endif

/**
* @brief Macro defined to whether or not the system is 32-bit x86.
*/
#if defined(__i386__) || defined(_M_IX86)
#define CUTTLEFISH_X86_32 1
#else
#define CUTTLEFISH_X86_32 0
#endif

/**
* @brief Macro defined to whether or not the system is 64-bit ARM.
*/
#if defined(__arm64__) || defined(__aarch64__)
#define CUTTLEFISH_ARM_64 1
#else
#define CUTTLEFISH_ARM_64 0
#endif

/**
* @brief Macro defined to whether or not the system is 32-bit ARM.
*/
#if defined(__arm__) || defined(_M_ARM)
#define CUTTLEFISH_ARM_32 1
#else
#define CUTTLEFISH_ARM_32 0
#endif

/**
* @brief Define for whether or not this is a debug build.
*/
Expand Down
18 changes: 9 additions & 9 deletions lib/src/Converter.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2017-2022 Aaron Barany
* Copyright 2017-2023 Aaron Barany
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -209,7 +209,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
case Texture::Type::Int:
return std::unique_ptr<Converter>(new IntConverter<std::int16_t, 1>(image));
case Texture::Type::Float:
return std::unique_ptr<Converter>(new FloatConverter<std::uint16_t, 1>(image));
return std::unique_ptr<Converter>(new HalfConverter<1>(image));
default:
return nullptr;
}
Expand All @@ -227,7 +227,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
case Texture::Type::Int:
return std::unique_ptr<Converter>(new IntConverter<std::int16_t, 2>(image));
case Texture::Type::Float:
return std::unique_ptr<Converter>(new FloatConverter<std::uint16_t, 2>(image));
return std::unique_ptr<Converter>(new HalfConverter<2>(image));
default:
return nullptr;
}
Expand All @@ -245,7 +245,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
case Texture::Type::Int:
return std::unique_ptr<Converter>(new IntConverter<std::int16_t, 3>(image));
case Texture::Type::Float:
return std::unique_ptr<Converter>(new FloatConverter<std::uint16_t, 3>(image));
return std::unique_ptr<Converter>(new HalfConverter<3>(image));
default:
return nullptr;
}
Expand All @@ -263,7 +263,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
case Texture::Type::Int:
return std::unique_ptr<Converter>(new IntConverter<std::int16_t, 4>(image));
case Texture::Type::Float:
return std::unique_ptr<Converter>(new FloatConverter<std::uint16_t, 4>(image));
return std::unique_ptr<Converter>(new HalfConverter<4>(image));
default:
return nullptr;
}
Expand All @@ -277,7 +277,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
case Texture::Type::Int:
return std::unique_ptr<Converter>(new IntConverter<std::int32_t, 1>(image));
case Texture::Type::Float:
return std::unique_ptr<Converter>(new FloatConverter<float, 1>(image));
return std::unique_ptr<Converter>(new FloatConverter<1>(image));
default:
return nullptr;
}
Expand All @@ -291,7 +291,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
case Texture::Type::Int:
return std::unique_ptr<Converter>(new IntConverter<std::int32_t, 2>(image));
case Texture::Type::Float:
return std::unique_ptr<Converter>(new FloatConverter<float, 2>(image));
return std::unique_ptr<Converter>(new FloatConverter<2>(image));
default:
return nullptr;
}
Expand All @@ -305,7 +305,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
case Texture::Type::Int:
return std::unique_ptr<Converter>(new IntConverter<std::int32_t, 3>(image));
case Texture::Type::Float:
return std::unique_ptr<Converter>(new FloatConverter<float, 3>(image));
return std::unique_ptr<Converter>(new FloatConverter<3>(image));
default:
return nullptr;
}
Expand All @@ -319,7 +319,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
case Texture::Type::Int:
return std::unique_ptr<Converter>(new IntConverter<std::int32_t, 4>(image));
case Texture::Type::Float:
return std::unique_ptr<Converter>(new FloatConverter<float, 4>(image));
return std::unique_ptr<Converter>(new FloatConverter<4>(image));
default:
return nullptr;
}
Expand Down
54 changes: 54 additions & 0 deletions lib/src/HalfFloat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright 2023 Aaron Barany
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "HalfFloat.h"

#if CUTTLEFISH_WINDOWS
#include <intrin.h>
#endif

namespace cuttlefish
{

#if CUTTLEFISH_SSE && !CUTTLEFISH_WINDOWS
static void __cpuid(int cpuInfo[4], int function)
{
cpuInfo[0] = function;
cpuInfo[2] = 0;
asm volatile("cpuid\n\t" : "+a"(cpuInfo[0]), "=b"(cpuInfo[1]), "+c"(cpuInfo[2]),
"=d"(cpuInfo[3]));
}
#endif

bool checkHasHardwareHalfFloat()
{
#if CUTTLEFISH_SSE
const int f16cBit = 1 << 29;

int cpuInfo[4];
__cpuid(cpuInfo, 1);
int ecx = cpuInfo[2];
return (ecx & f16cBit) != 0;
#elif CUTTLEFISH_NEON
return true;
#else
return false;
#endif
}

const bool hasHardwareHalfFloat = checkHasHardwareHalfFloat();

} // cuttlefish
138 changes: 138 additions & 0 deletions lib/src/HalfFloat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Copyright 2023 Aaron Barany
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuttlefish/Config.h>
#include <cuttlefish/Export.h>

#include <cassert>
#include <cstdint>

#if CUTTLEFISH_X86_32 || CUTTLEFISH_X86_64
#include <immintrin.h>
#define CUTTLEFISH_SSE 1
#define CUTTLEFISH_NEON 0
#elif CUTTLEFISH_ARM_32 || CUTTLEFISH_ARM_64
#include <arm_neon.h>
#define CUTTLEFISH_SSE 0
#define CUTTLEFISH_NEON 1
#else
#define CUTTLEFISH_SSE 0
#define CUTTLEFISH_NEON 0
#endif

#if CUTTLEFISH_SSE && CUTTLEFISH_CLANG
#define CUTTLEFISH_START_HALF_FLOAT() \
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,f16c\"))), apply_to = function)")
#define CUTTLEFISH_END_HALF_FLOAT() _Pragma("clang attribute pop")
#elif CUTTLEFISH_SSE && CUTTLEFISH_GCC
#define CUTTLEFISH_START_HALF_FLOAT() \
_Pragma("GCC push_options") \
_Pragma("GCC target(\"sse,sse2,f16c\")")
#define CUTTLEFISH_END_HALF_FLOAT() _Pragma("GCC pop_options")
#else
#define CUTTLEFISH_START_HALF_FLOAT()
#define CUTTLEFISH_END_HALF_FLOAT()
#endif

namespace cuttlefish
{

// Export for unit tests.
CUTTLEFISH_EXPORT extern const bool hasHardwareHalfFloat;

CUTTLEFISH_START_HALF_FLOAT()

// NOTE: Always assume input has 4 floats, though output may be different.
inline void packHardwareHalfFloat1(std::uint16_t* result, const float* value)
{
#if CUTTLEFISH_SSE
__m128 f = _mm_loadu_ps(value);
__m128i h = _mm_cvtps_ph(f, 0);
*result = static_cast<std::uint16_t>(_mm_cvtsi128_si32(h));
#elif CUTTLEFISH_NEON
float32x4_t f = vld1q_f32(value);
float16x4_t h = vcvt_f16_f32(f);
vst1_lane_f16(reinterpret_cast<float16_t*>(result), h, 0);
#else
CUTTLEFISH_UNUSED(result);
CUTTLEFISH_UNUSED(value);
assert(false);
#endif
}

inline void packHardwareHalfFloat2(std::uint16_t* result, const float* value)
{
#if CUTTLEFISH_SSE
__m128 f = _mm_loadu_ps(value);
__m128i h = _mm_cvtps_ph(f, 0);
*reinterpret_cast<std::uint32_t*>(result) = _mm_cvtsi128_si32(h);
#elif CUTTLEFISH_NEON
float32x4_t f = vld1q_f32(value);
float16x4_t h = vcvt_f16_f32(f);
vst1_lane_f16(reinterpret_cast<float16_t*>(result), h, 0);
vst1_lane_f16(reinterpret_cast<float16_t*>(result) + 1, h, 1);
#else
CUTTLEFISH_UNUSED(result);
CUTTLEFISH_UNUSED(value);
assert(false);
#endif
}

inline void packHardwareHalfFloat3(std::uint16_t* result, const float* value)
{
#if CUTTLEFISH_SSE
__m128 f = _mm_loadu_ps(value);
__m128i h = _mm_cvtps_ph(f, 0);
std::uint64_t temp;
_mm_storeu_si64(&temp, h);
result[0] = reinterpret_cast<std::uint16_t*>(&temp)[0];
result[1] = reinterpret_cast<std::uint16_t*>(&temp)[1];
result[2] = reinterpret_cast<std::uint16_t*>(&temp)[2];
#elif CUTTLEFISH_NEON
float32x4_t f = vld1q_f32(value);
float16x4_t h = vcvt_f16_f32(f);
vst1_lane_f16(reinterpret_cast<float16_t*>(result), h, 0);
vst1_lane_f16(reinterpret_cast<float16_t*>(result) + 1, h, 1);
vst1_lane_f16(reinterpret_cast<float16_t*>(result) + 2, h, 2);
#else
CUTTLEFISH_UNUSED(result);
CUTTLEFISH_UNUSED(value);
assert(false);
#endif
}

inline void packHardwareHalfFloat4(std::uint16_t* result, const float* value)
{
#if CUTTLEFISH_SSE
__m128 f = _mm_loadu_ps(value);
__m128i h = _mm_cvtps_ph(f, 0);
_mm_storeu_si64(result, h);
#elif CUTTLEFISH_NEON
float32x4_t f = vld1q_f32(value);
float16x4_t h = vcvt_f16_f32(f);
vst1_f16(reinterpret_cast<float16_t*>(result), h);
#else
CUTTLEFISH_UNUSED(result);
CUTTLEFISH_UNUSED(value);
assert(false);
#endif
}

CUTTLEFISH_END_HALF_FLOAT()

} // namespace cuttlefish
Loading

0 comments on commit 40dd18b

Please sign in to comment.