From 40dd18bbe6647116201a4be4988d00fa04fba2f1 Mon Sep 17 00:00:00 2001 From: Aaron Barany Date: Sun, 22 Jan 2023 18:42:39 -0800 Subject: [PATCH] Added support for hardware half float conversions. Check at startup if hardware half floats are supported for the current CPU and use it when available. This will be used for modern x86 and all ARM CPUs. Should improve the performance for both uncompressed half float and BC6H formats. --- lib/include/cuttlefish/Config.h | 38 ++++++++- lib/src/Converter.cpp | 18 ++--- lib/src/HalfFloat.cpp | 54 +++++++++++++ lib/src/HalfFloat.h | 138 ++++++++++++++++++++++++++++++++ lib/src/S3tcConverter.cpp | 39 ++++++--- lib/src/StandardConverter.h | 104 +++++++++++++++++++----- lib/test/CMakeLists.txt | 5 +- lib/test/HalfFloatTest.cpp | 71 ++++++++++++++++ 8 files changed, 427 insertions(+), 40 deletions(-) create mode 100644 lib/src/HalfFloat.cpp create mode 100644 lib/src/HalfFloat.h create mode 100644 lib/test/HalfFloatTest.cpp diff --git a/lib/include/cuttlefish/Config.h b/lib/include/cuttlefish/Config.h index 0c81bb9..40c5253 100644 --- a/lib/include/cuttlefish/Config.h +++ b/lib/include/cuttlefish/Config.h @@ -1,5 +1,5 @@ /* - * Copyright 2016 Aaron Barany + * Copyright 2016-2023 Aaron Barany * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,6 +91,42 @@ #define CUTTLEFISH_64BIT 0 #endif +/** + * @brief Macro defined to whether or not the system is 64-bit x86. + */ +#if defined(__x86_64__) || defined(_M_AMD64) +#define CUTTLEFISH_X86_64 1 +#else +#define CUTTLEFISH_X86_64 0 +#endif + +/** + * @brief Macro defined to whether or not the system is 32-bit x86. + */ +#if defined(__i386__) || defined(_M_IX86) +#define CUTTLEFISH_X86_32 1 +#else +#define CUTTLEFISH_X86_32 0 +#endif + +/** + * @brief Macro defined to whether or not the system is 64-bit ARM. + */ +#if defined(__arm64__) || defined(__aarch64__) +#define CUTTLEFISH_ARM_64 1 +#else +#define CUTTLEFISH_ARM_64 0 +#endif + +/** + * @brief Macro defined to whether or not the system is 32-bit ARM. + */ +#if defined(__arm__) || defined(_M_ARM) +#define CUTTLEFISH_ARM_32 1 +#else +#define CUTTLEFISH_ARM_32 0 +#endif + /** * @brief Define for whether or not this is a debug build. */ diff --git a/lib/src/Converter.cpp b/lib/src/Converter.cpp index d2c20da..e6fbb77 100644 --- a/lib/src/Converter.cpp +++ b/lib/src/Converter.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Aaron Barany + * Copyright 2017-2023 Aaron Barany * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -209,7 +209,7 @@ static std::unique_ptr createConverter(const Texture& texture, const case Texture::Type::Int: return std::unique_ptr(new IntConverter(image)); case Texture::Type::Float: - return std::unique_ptr(new FloatConverter(image)); + return std::unique_ptr(new HalfConverter<1>(image)); default: return nullptr; } @@ -227,7 +227,7 @@ static std::unique_ptr createConverter(const Texture& texture, const case Texture::Type::Int: return std::unique_ptr(new IntConverter(image)); case Texture::Type::Float: - return std::unique_ptr(new FloatConverter(image)); + return std::unique_ptr(new HalfConverter<2>(image)); default: return nullptr; } @@ -245,7 +245,7 @@ static std::unique_ptr createConverter(const Texture& texture, const case Texture::Type::Int: return std::unique_ptr(new IntConverter(image)); case Texture::Type::Float: - return std::unique_ptr(new FloatConverter(image)); + return std::unique_ptr(new HalfConverter<3>(image)); default: return nullptr; } @@ -263,7 +263,7 @@ static std::unique_ptr createConverter(const Texture& texture, const case Texture::Type::Int: return std::unique_ptr(new IntConverter(image)); case Texture::Type::Float: - return std::unique_ptr(new FloatConverter(image)); + return std::unique_ptr(new HalfConverter<4>(image)); default: return nullptr; } @@ -277,7 +277,7 @@ static std::unique_ptr createConverter(const Texture& texture, const case Texture::Type::Int: return std::unique_ptr(new IntConverter(image)); case Texture::Type::Float: - return std::unique_ptr(new FloatConverter(image)); + return std::unique_ptr(new FloatConverter<1>(image)); default: return nullptr; } @@ -291,7 +291,7 @@ static std::unique_ptr createConverter(const Texture& texture, const case Texture::Type::Int: return std::unique_ptr(new IntConverter(image)); case Texture::Type::Float: - return std::unique_ptr(new FloatConverter(image)); + return std::unique_ptr(new FloatConverter<2>(image)); default: return nullptr; } @@ -305,7 +305,7 @@ static std::unique_ptr createConverter(const Texture& texture, const case Texture::Type::Int: return std::unique_ptr(new IntConverter(image)); case Texture::Type::Float: - return std::unique_ptr(new FloatConverter(image)); + return std::unique_ptr(new FloatConverter<3>(image)); default: return nullptr; } @@ -319,7 +319,7 @@ static std::unique_ptr createConverter(const Texture& texture, const case Texture::Type::Int: return std::unique_ptr(new IntConverter(image)); case Texture::Type::Float: - return std::unique_ptr(new FloatConverter(image)); + return std::unique_ptr(new FloatConverter<4>(image)); default: return nullptr; } diff --git a/lib/src/HalfFloat.cpp b/lib/src/HalfFloat.cpp new file mode 100644 index 0000000..3fe112b --- /dev/null +++ b/lib/src/HalfFloat.cpp @@ -0,0 +1,54 @@ +/* + * Copyright 2023 Aaron Barany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "HalfFloat.h" + +#if CUTTLEFISH_WINDOWS +#include +#endif + +namespace cuttlefish +{ + +#if CUTTLEFISH_SSE && !CUTTLEFISH_WINDOWS +static void __cpuid(int cpuInfo[4], int function) +{ + cpuInfo[0] = function; + cpuInfo[2] = 0; + asm volatile("cpuid\n\t" : "+a"(cpuInfo[0]), "=b"(cpuInfo[1]), "+c"(cpuInfo[2]), + "=d"(cpuInfo[3])); +} +#endif + +bool checkHasHardwareHalfFloat() +{ +#if CUTTLEFISH_SSE + const int f16cBit = 1 << 29; + + int cpuInfo[4]; + __cpuid(cpuInfo, 1); + int ecx = cpuInfo[2]; + return (ecx & f16cBit) != 0; +#elif CUTTLEFISH_NEON + return true; +#else + return false; +#endif +} + +const bool hasHardwareHalfFloat = checkHasHardwareHalfFloat(); + +} // cuttlefish diff --git a/lib/src/HalfFloat.h b/lib/src/HalfFloat.h new file mode 100644 index 0000000..55fc9df --- /dev/null +++ b/lib/src/HalfFloat.h @@ -0,0 +1,138 @@ +/* + * Copyright 2023 Aaron Barany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#if CUTTLEFISH_X86_32 || CUTTLEFISH_X86_64 +#include +#define CUTTLEFISH_SSE 1 +#define CUTTLEFISH_NEON 0 +#elif CUTTLEFISH_ARM_32 || CUTTLEFISH_ARM_64 +#include +#define CUTTLEFISH_SSE 0 +#define CUTTLEFISH_NEON 1 +#else +#define CUTTLEFISH_SSE 0 +#define CUTTLEFISH_NEON 0 +#endif + +#if CUTTLEFISH_SSE && CUTTLEFISH_CLANG +#define CUTTLEFISH_START_HALF_FLOAT() \ + _Pragma("clang attribute push(__attribute__((target(\"sse,sse2,f16c\"))), apply_to = function)") +#define CUTTLEFISH_END_HALF_FLOAT() _Pragma("clang attribute pop") +#elif CUTTLEFISH_SSE && CUTTLEFISH_GCC +#define CUTTLEFISH_START_HALF_FLOAT() \ + _Pragma("GCC push_options") \ + _Pragma("GCC target(\"sse,sse2,f16c\")") +#define CUTTLEFISH_END_HALF_FLOAT() _Pragma("GCC pop_options") +#else +#define CUTTLEFISH_START_HALF_FLOAT() +#define CUTTLEFISH_END_HALF_FLOAT() +#endif + +namespace cuttlefish +{ + +// Export for unit tests. +CUTTLEFISH_EXPORT extern const bool hasHardwareHalfFloat; + +CUTTLEFISH_START_HALF_FLOAT() + +// NOTE: Always assume input has 4 floats, though output may be different. +inline void packHardwareHalfFloat1(std::uint16_t* result, const float* value) +{ +#if CUTTLEFISH_SSE + __m128 f = _mm_loadu_ps(value); + __m128i h = _mm_cvtps_ph(f, 0); + *result = static_cast(_mm_cvtsi128_si32(h)); +#elif CUTTLEFISH_NEON + float32x4_t f = vld1q_f32(value); + float16x4_t h = vcvt_f16_f32(f); + vst1_lane_f16(reinterpret_cast(result), h, 0); +#else + CUTTLEFISH_UNUSED(result); + CUTTLEFISH_UNUSED(value); + assert(false); +#endif +} + +inline void packHardwareHalfFloat2(std::uint16_t* result, const float* value) +{ +#if CUTTLEFISH_SSE + __m128 f = _mm_loadu_ps(value); + __m128i h = _mm_cvtps_ph(f, 0); + *reinterpret_cast(result) = _mm_cvtsi128_si32(h); +#elif CUTTLEFISH_NEON + float32x4_t f = vld1q_f32(value); + float16x4_t h = vcvt_f16_f32(f); + vst1_lane_f16(reinterpret_cast(result), h, 0); + vst1_lane_f16(reinterpret_cast(result) + 1, h, 1); +#else + CUTTLEFISH_UNUSED(result); + CUTTLEFISH_UNUSED(value); + assert(false); +#endif +} + +inline void packHardwareHalfFloat3(std::uint16_t* result, const float* value) +{ +#if CUTTLEFISH_SSE + __m128 f = _mm_loadu_ps(value); + __m128i h = _mm_cvtps_ph(f, 0); + std::uint64_t temp; + _mm_storeu_si64(&temp, h); + result[0] = reinterpret_cast(&temp)[0]; + result[1] = reinterpret_cast(&temp)[1]; + result[2] = reinterpret_cast(&temp)[2]; +#elif CUTTLEFISH_NEON + float32x4_t f = vld1q_f32(value); + float16x4_t h = vcvt_f16_f32(f); + vst1_lane_f16(reinterpret_cast(result), h, 0); + vst1_lane_f16(reinterpret_cast(result) + 1, h, 1); + vst1_lane_f16(reinterpret_cast(result) + 2, h, 2); +#else + CUTTLEFISH_UNUSED(result); + CUTTLEFISH_UNUSED(value); + assert(false); +#endif +} + +inline void packHardwareHalfFloat4(std::uint16_t* result, const float* value) +{ +#if CUTTLEFISH_SSE + __m128 f = _mm_loadu_ps(value); + __m128i h = _mm_cvtps_ph(f, 0); + _mm_storeu_si64(result, h); +#elif CUTTLEFISH_NEON + float32x4_t f = vld1q_f32(value); + float16x4_t h = vcvt_f16_f32(f); + vst1_f16(reinterpret_cast(result), h); +#else + CUTTLEFISH_UNUSED(result); + CUTTLEFISH_UNUSED(value); + assert(false); +#endif +} + +CUTTLEFISH_END_HALF_FLOAT() + +} // namespace cuttlefish diff --git a/lib/src/S3tcConverter.cpp b/lib/src/S3tcConverter.cpp index 0a3d138..6fcbd6a 100644 --- a/lib/src/S3tcConverter.cpp +++ b/lib/src/S3tcConverter.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2017-2021 Aaron Barany + * Copyright 2017-2023 Aaron Barany * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #if CUTTLEFISH_HAS_S3TC #include "S3tcConverter.h" + +#include "HalfFloat.h" #include "Shared.h" #include @@ -524,18 +526,27 @@ Bc6HConverter::~Bc6HConverter() DestroyOptionsBC6(m_compressonatorOptions); } +CUTTLEFISH_START_HALF_FLOAT() void Bc6HConverter::compressBlock(void* block, ColorRGBAf* blockColors) { #if CUTTLEFISH_ISPC if (m_ispcTexcompSettings) { std::uint16_t colorBlock[blockPixels][4]; - for (unsigned int i = 0; i < blockPixels; ++i) + if (hasHardwareHalfFloat) + { + for (unsigned int i = 0; i < blockPixels; ++i) + packHardwareHalfFloat4(colorBlock[i], reinterpret_cast(blockColors + i)); + } + else { - for (unsigned int j = 0; j < 4; ++j) + for (unsigned int i = 0; i < blockPixels; ++i) { - colorBlock[i][j] = - glm::packHalf(glm::vec1(reinterpret_cast(blockColors + i)[j])).x; + for (unsigned int j = 0; j < 4; ++j) + { + colorBlock[i][j] = + glm::packHalf(glm::vec1(reinterpret_cast(blockColors + i)[j])).x; + } } } @@ -548,19 +559,27 @@ void Bc6HConverter::compressBlock(void* block, ColorRGBAf* blockColors) assert(m_compressonatorOptions); std::uint16_t colorBlock[blockPixels][3]; - for (unsigned int i = 0; i < blockPixels; ++i) + if (hasHardwareHalfFloat) + { + for (unsigned int i = 0; i < blockPixels; ++i) + packHardwareHalfFloat3(colorBlock[i], reinterpret_cast(blockColors + i)); + } + else { - for (unsigned int j = 0; j < 3; ++j) + for (unsigned int i = 0; i < blockPixels; ++i) { - colorBlock[i][j] = - glm::packHalf(glm::vec1(reinterpret_cast(blockColors + i)[j])).x; + for (unsigned int j = 0; j < 3; ++j) + { + colorBlock[i][j] = + glm::packHalf(glm::vec1(reinterpret_cast(blockColors + i)[j])).x; + } } } CompressBlockBC6(reinterpret_cast(colorBlock), 3*blockDim, reinterpret_cast(block), m_compressonatorOptions); } - +CUTTLEFISH_END_HALF_FLOAT() Bc7Converter::Bc7Converter(const Texture& texture, const Image& image, Texture::Quality quality) : S3tcConverter(texture, image, 16, quality), m_params(nullptr) diff --git a/lib/src/StandardConverter.h b/lib/src/StandardConverter.h index 043de43..f218d10 100644 --- a/lib/src/StandardConverter.h +++ b/lib/src/StandardConverter.h @@ -1,5 +1,5 @@ /* - * Copyright 2017 Aaron Barany + * Copyright 2017-2023 Aaron Barany * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,11 @@ #include #include "Converter.h" +#include "HalfFloat.h" #include "Shared.h" #include #include +#include #include #include @@ -39,18 +41,6 @@ namespace cuttlefish { -template -inline T toFloat(float f) -{ - return f; -} - -template <> -inline std::uint16_t toFloat(float f) -{ - return glm::packHalf(glm::vec1(f)).x; -} - template class StandardConverter : public Converter { @@ -200,23 +190,23 @@ class IntConverter : public StandardConverter } }; -template -class FloatConverter : public StandardConverter +template +class FloatConverter : public StandardConverter { public: - using StandardConverter::batchSize; + using StandardConverter::batchSize; using typename Converter::ThreadData; using Converter::data; using Converter::image; explicit FloatConverter(const Image& image) - : StandardConverter(image) + : StandardConverter(image) { } void process(unsigned int x, unsigned int, ThreadData*) override { - T* curData = reinterpret_cast(data().data()) + x*batchSize*C; + float* curData = reinterpret_cast(data().data()) + x*batchSize*C; unsigned int row = x*batchSize/image().width(); const float* scanline = reinterpret_cast(image().scanline(row)); for (unsigned int i = 0; i < batchSize; ++i) @@ -232,10 +222,86 @@ class FloatConverter : public StandardConverter unsigned int col = (x*batchSize + i) % image().width(); for (unsigned int c = 0; c < C; ++c) - curData[i*C + c] = toFloat(scanline[col*4 + c]); + curData[i*C + c] = scanline[col*4 + c]; + } + } +}; + +CUTTLEFISH_START_HALF_FLOAT() +template +class HalfConverter : public StandardConverter +{ +public: + using StandardConverter::batchSize; + using typename Converter::ThreadData; + using Converter::data; + using Converter::image; + + explicit HalfConverter(const Image& image) + : StandardConverter(image) + { + } + + void process(unsigned int x, unsigned int, ThreadData*) override + { + std::uint16_t* curData = reinterpret_cast(data().data()) + x*batchSize*C; + unsigned int row = x*batchSize/image().width(); + const float* scanline = reinterpret_cast(image().scanline(row)); + + if (hasHardwareHalfFloat) + { + for (unsigned int i = 0; i < batchSize; ++i) + { + unsigned int curRow = (x*batchSize + i)/image().width(); + if (curRow != row) + { + if (curRow >= image().height()) + break; + row = curRow; + scanline = reinterpret_cast(image().scanline(row)); + } + + unsigned int col = (x*batchSize + i) % image().width(); + switch (C) + { + case 1: + packHardwareHalfFloat1(curData + i*C, scanline + col*4); + break; + case 2: + packHardwareHalfFloat2(curData + i*C, scanline + col*4); + break; + case 3: + packHardwareHalfFloat3(curData + i*C, scanline + col*4); + break; + case 4: + packHardwareHalfFloat4(curData + i*C, scanline + col*4); + break; + default: + assert(false); + } + } + } + else + { + for (unsigned int i = 0; i < batchSize; ++i) + { + unsigned int curRow = (x*batchSize + i)/image().width(); + if (curRow != row) + { + if (curRow >= image().height()) + break; + row = curRow; + scanline = reinterpret_cast(image().scanline(row)); + } + + unsigned int col = (x*batchSize + i) % image().width(); + for (unsigned int c = 0; c < C; ++c) + curData[i*C + c] = glm::packHalf(glm::vec1(scanline[col*4 + c])).x; + } } } }; +CUTTLEFISH_END_HALF_FLOAT() class R4G4Converter : public StandardConverter { diff --git a/lib/test/CMakeLists.txt b/lib/test/CMakeLists.txt index a509dd3..9ea1041 100644 --- a/lib/test/CMakeLists.txt +++ b/lib/test/CMakeLists.txt @@ -7,7 +7,10 @@ find_package(Threads) file(GLOB_RECURSE sources *.cpp *.h) add_executable(cuttlefish_lib_test ${sources}) -target_include_directories(cuttlefish_lib_test PRIVATE ${GTEST_INCLUDE_DIRS}) +target_include_directories(cuttlefish_lib_test PRIVATE + ${GTEST_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../src + ${CMAKE_CURRENT_SOURCE_DIR}/../glm) target_link_libraries(cuttlefish_lib_test PRIVATE Cuttlefish::lib ${GTEST_BOTH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) diff --git a/lib/test/HalfFloatTest.cpp b/lib/test/HalfFloatTest.cpp new file mode 100644 index 0000000..9de4ba2 --- /dev/null +++ b/lib/test/HalfFloatTest.cpp @@ -0,0 +1,71 @@ +/* + * Copyright 2023 Aaron Barany + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "HalfFloat.h" +#include + +#if CUTTLEFISH_GCC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#include + +#if CUTTLEFISH_GCC +#pragma GCC diagnostic pop +#endif + +namespace cuttlefish +{ + +TEST(HalfFloatTest, PackHardwareHalfFloat) +{ + if (!hasHardwareHalfFloat) + return; + + float floatValues[4] = {1.2f, -3.4f, 5.6f, -7.8f}; + std::uint16_t halfFloatValues[4]; + *reinterpret_cast(halfFloatValues) = + glm::packHalf4x16(*reinterpret_cast(floatValues)); + + std::uint16_t convertedValues[4] = {0, 0, 0, 0}; + + packHardwareHalfFloat1(convertedValues, floatValues); + EXPECT_EQ(halfFloatValues[0], convertedValues[0]); + EXPECT_EQ(0, convertedValues[1]); + EXPECT_EQ(0, convertedValues[2]); + EXPECT_EQ(0, convertedValues[3]); + + packHardwareHalfFloat2(convertedValues, floatValues); + EXPECT_EQ(halfFloatValues[0], convertedValues[0]); + EXPECT_EQ(halfFloatValues[1], convertedValues[1]); + EXPECT_EQ(0, convertedValues[2]); + EXPECT_EQ(0, convertedValues[3]); + + packHardwareHalfFloat3(convertedValues, floatValues); + EXPECT_EQ(halfFloatValues[0], convertedValues[0]); + EXPECT_EQ(halfFloatValues[1], convertedValues[1]); + EXPECT_EQ(halfFloatValues[2], convertedValues[2]); + EXPECT_EQ(0, convertedValues[3]); + + packHardwareHalfFloat4(convertedValues, floatValues); + EXPECT_EQ(halfFloatValues[0], convertedValues[0]); + EXPECT_EQ(halfFloatValues[1], convertedValues[1]); + EXPECT_EQ(halfFloatValues[2], convertedValues[2]); + EXPECT_EQ(halfFloatValues[3], convertedValues[3]); +} + +} // cuttlefish