Added support for hardware half float conversions.

Check at startup if hardware half floats are supported for the current CPU and use it when available. This will be used for modern x86 and all ARM CPUs. Should improve the performance for both uncompressed half float and BC6H formats.
akb825 · Jan 23, 2023 · 40dd18b · 40dd18b
1 parent 27607ef
commit 40dd18b
Show file tree

Hide file tree

Showing 8 changed files with 427 additions and 40 deletions.
diff --git a/lib/include/cuttlefish/Config.h b/lib/include/cuttlefish/Config.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2016 Aaron Barany
+ * Copyright 2016-2023 Aaron Barany
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -91,6 +91,42 @@
 #define CUTTLEFISH_64BIT 0
 #endif
 
+/**
+ * @brief Macro defined to whether or not the system is 64-bit x86.
+ */
+#if defined(__x86_64__) || defined(_M_AMD64)
+#define CUTTLEFISH_X86_64 1
+#else
+#define CUTTLEFISH_X86_64 0
+#endif
+
+/**
+ * @brief Macro defined to whether or not the system is 32-bit x86.
+ */
+#if defined(__i386__) || defined(_M_IX86)
+#define CUTTLEFISH_X86_32 1
+#else
+#define CUTTLEFISH_X86_32 0
+#endif
+
+/**
+ * @brief Macro defined to whether or not the system is 64-bit ARM.
+ */
+#if defined(__arm64__) || defined(__aarch64__)
+#define CUTTLEFISH_ARM_64 1
+#else
+#define CUTTLEFISH_ARM_64 0
+#endif
+
+/**
+ * @brief Macro defined to whether or not the system is 32-bit ARM.
+ */
+#if defined(__arm__) || defined(_M_ARM)
+#define CUTTLEFISH_ARM_32 1
+#else
+#define CUTTLEFISH_ARM_32 0
+#endif
+
 /**
  * @brief Define for whether or not this is a debug build.
  */

diff --git a/lib/src/Converter.cpp b/lib/src/Converter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2017-2022 Aaron Barany
+ * Copyright 2017-2023 Aaron Barany
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -209,7 +209,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
 				case Texture::Type::Int:
 					return std::unique_ptr<Converter>(new IntConverter<std::int16_t, 1>(image));
 				case Texture::Type::Float:
-					return std::unique_ptr<Converter>(new FloatConverter<std::uint16_t, 1>(image));
+					return std::unique_ptr<Converter>(new HalfConverter<1>(image));
 				default:
 					return nullptr;
 			}
@@ -227,7 +227,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
 				case Texture::Type::Int:
 					return std::unique_ptr<Converter>(new IntConverter<std::int16_t, 2>(image));
 				case Texture::Type::Float:
-					return std::unique_ptr<Converter>(new FloatConverter<std::uint16_t, 2>(image));
+					return std::unique_ptr<Converter>(new HalfConverter<2>(image));
 				default:
 					return nullptr;
 			}
@@ -245,7 +245,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
 				case Texture::Type::Int:
 					return std::unique_ptr<Converter>(new IntConverter<std::int16_t, 3>(image));
 				case Texture::Type::Float:
-					return std::unique_ptr<Converter>(new FloatConverter<std::uint16_t, 3>(image));
+					return std::unique_ptr<Converter>(new HalfConverter<3>(image));
 				default:
 					return nullptr;
 			}
@@ -263,7 +263,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
 				case Texture::Type::Int:
 					return std::unique_ptr<Converter>(new IntConverter<std::int16_t, 4>(image));
 				case Texture::Type::Float:
-					return std::unique_ptr<Converter>(new FloatConverter<std::uint16_t, 4>(image));
+					return std::unique_ptr<Converter>(new HalfConverter<4>(image));
 				default:
 					return nullptr;
 			}
@@ -277,7 +277,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
 				case Texture::Type::Int:
 					return std::unique_ptr<Converter>(new IntConverter<std::int32_t, 1>(image));
 				case Texture::Type::Float:
-					return std::unique_ptr<Converter>(new FloatConverter<float, 1>(image));
+					return std::unique_ptr<Converter>(new FloatConverter<1>(image));
 				default:
 					return nullptr;
 			}
@@ -291,7 +291,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
 				case Texture::Type::Int:
 					return std::unique_ptr<Converter>(new IntConverter<std::int32_t, 2>(image));
 				case Texture::Type::Float:
-					return std::unique_ptr<Converter>(new FloatConverter<float, 2>(image));
+					return std::unique_ptr<Converter>(new FloatConverter<2>(image));
 				default:
 					return nullptr;
 			}
@@ -305,7 +305,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
 				case Texture::Type::Int:
 					return std::unique_ptr<Converter>(new IntConverter<std::int32_t, 3>(image));
 				case Texture::Type::Float:
-					return std::unique_ptr<Converter>(new FloatConverter<float, 3>(image));
+					return std::unique_ptr<Converter>(new FloatConverter<3>(image));
 				default:
 					return nullptr;
 			}
@@ -319,7 +319,7 @@ static std::unique_ptr<Converter> createConverter(const Texture& texture, const
 				case Texture::Type::Int:
 					return std::unique_ptr<Converter>(new IntConverter<std::int32_t, 4>(image));
 				case Texture::Type::Float:
-					return std::unique_ptr<Converter>(new FloatConverter<float, 4>(image));
+					return std::unique_ptr<Converter>(new FloatConverter<4>(image));
 				default:
 					return nullptr;
 			}

diff --git a/lib/src/HalfFloat.cpp b/lib/src/HalfFloat.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2023 Aaron Barany
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "HalfFloat.h"
+
+#if CUTTLEFISH_WINDOWS
+#include <intrin.h>
+#endif
+
+namespace cuttlefish
+{
+
+#if CUTTLEFISH_SSE && !CUTTLEFISH_WINDOWS
+static void __cpuid(int cpuInfo[4], int function)
+{
+	cpuInfo[0] = function;
+	cpuInfo[2] = 0;
+	asm volatile("cpuid\n\t" : "+a"(cpuInfo[0]), "=b"(cpuInfo[1]), "+c"(cpuInfo[2]),
+		"=d"(cpuInfo[3]));
+}
+#endif
+
+bool checkHasHardwareHalfFloat()
+{
+#if CUTTLEFISH_SSE
+	const int f16cBit = 1 << 29;
+
+	int cpuInfo[4];
+	__cpuid(cpuInfo, 1);
+	int ecx = cpuInfo[2];
+	return (ecx & f16cBit) != 0;
+#elif CUTTLEFISH_NEON
+	return true;
+#else
+	return false;
+#endif
+}
+
+const bool hasHardwareHalfFloat = checkHasHardwareHalfFloat();
+
+} // cuttlefish
diff --git a/lib/src/HalfFloat.h b/lib/src/HalfFloat.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2023 Aaron Barany
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuttlefish/Config.h>
+#include <cuttlefish/Export.h>
+
+#include <cassert>
+#include <cstdint>
+
+#if CUTTLEFISH_X86_32 || CUTTLEFISH_X86_64
+#include <immintrin.h>
+#define CUTTLEFISH_SSE 1
+#define CUTTLEFISH_NEON 0
+#elif CUTTLEFISH_ARM_32 || CUTTLEFISH_ARM_64
+#include <arm_neon.h>
+#define CUTTLEFISH_SSE 0
+#define CUTTLEFISH_NEON 1
+#else
+#define CUTTLEFISH_SSE 0
+#define CUTTLEFISH_NEON 0
+#endif
+
+#if CUTTLEFISH_SSE && CUTTLEFISH_CLANG
+#define CUTTLEFISH_START_HALF_FLOAT() \
+	_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,f16c\"))), apply_to = function)")
+#define CUTTLEFISH_END_HALF_FLOAT() _Pragma("clang attribute pop")
+#elif CUTTLEFISH_SSE && CUTTLEFISH_GCC
+#define CUTTLEFISH_START_HALF_FLOAT() \
+	_Pragma("GCC push_options") \
+	_Pragma("GCC target(\"sse,sse2,f16c\")")
+#define CUTTLEFISH_END_HALF_FLOAT() _Pragma("GCC pop_options")
+#else
+#define CUTTLEFISH_START_HALF_FLOAT()
+#define CUTTLEFISH_END_HALF_FLOAT()
+#endif
+
+namespace cuttlefish
+{
+
+// Export for unit tests.
+CUTTLEFISH_EXPORT extern const bool hasHardwareHalfFloat;
+
+CUTTLEFISH_START_HALF_FLOAT()
+
+// NOTE: Always assume input has 4 floats, though output may be different.
+inline void packHardwareHalfFloat1(std::uint16_t* result, const float* value)
+{
+#if CUTTLEFISH_SSE
+	__m128 f = _mm_loadu_ps(value);
+	__m128i h = _mm_cvtps_ph(f, 0);
+	*result = static_cast<std::uint16_t>(_mm_cvtsi128_si32(h));
+#elif CUTTLEFISH_NEON
+	float32x4_t f = vld1q_f32(value);
+	float16x4_t h = vcvt_f16_f32(f);
+	vst1_lane_f16(reinterpret_cast<float16_t*>(result), h, 0);
+#else
+	CUTTLEFISH_UNUSED(result);
+	CUTTLEFISH_UNUSED(value);
+	assert(false);
+#endif
+}
+
+inline void packHardwareHalfFloat2(std::uint16_t* result, const float* value)
+{
+#if CUTTLEFISH_SSE
+	__m128 f = _mm_loadu_ps(value);
+	__m128i h = _mm_cvtps_ph(f, 0);
+	*reinterpret_cast<std::uint32_t*>(result) = _mm_cvtsi128_si32(h);
+#elif CUTTLEFISH_NEON
+	float32x4_t f = vld1q_f32(value);
+	float16x4_t h = vcvt_f16_f32(f);
+	vst1_lane_f16(reinterpret_cast<float16_t*>(result), h, 0);
+	vst1_lane_f16(reinterpret_cast<float16_t*>(result) + 1, h, 1);
+#else
+	CUTTLEFISH_UNUSED(result);
+	CUTTLEFISH_UNUSED(value);
+	assert(false);
+#endif
+}
+
+inline void packHardwareHalfFloat3(std::uint16_t* result, const float* value)
+{
+#if CUTTLEFISH_SSE
+	__m128 f = _mm_loadu_ps(value);
+	__m128i h = _mm_cvtps_ph(f, 0);
+	std::uint64_t temp;
+	_mm_storeu_si64(&temp, h);
+	result[0] = reinterpret_cast<std::uint16_t*>(&temp)[0];
+	result[1] = reinterpret_cast<std::uint16_t*>(&temp)[1];
+	result[2] = reinterpret_cast<std::uint16_t*>(&temp)[2];
+#elif CUTTLEFISH_NEON
+	float32x4_t f = vld1q_f32(value);
+	float16x4_t h = vcvt_f16_f32(f);
+	vst1_lane_f16(reinterpret_cast<float16_t*>(result), h, 0);
+	vst1_lane_f16(reinterpret_cast<float16_t*>(result) + 1, h, 1);
+	vst1_lane_f16(reinterpret_cast<float16_t*>(result) + 2, h, 2);
+#else
+	CUTTLEFISH_UNUSED(result);
+	CUTTLEFISH_UNUSED(value);
+	assert(false);
+#endif
+}
+
+inline void packHardwareHalfFloat4(std::uint16_t* result, const float* value)
+{
+#if CUTTLEFISH_SSE
+	__m128 f = _mm_loadu_ps(value);
+	__m128i h = _mm_cvtps_ph(f, 0);
+	_mm_storeu_si64(result, h);
+#elif CUTTLEFISH_NEON
+	float32x4_t f = vld1q_f32(value);
+	float16x4_t h = vcvt_f16_f32(f);
+	vst1_f16(reinterpret_cast<float16_t*>(result), h);
+#else
+	CUTTLEFISH_UNUSED(result);
+	CUTTLEFISH_UNUSED(value);
+	assert(false);
+#endif
+}
+
+CUTTLEFISH_END_HALF_FLOAT()
+
+} // namespace cuttlefish