Skip to content

Commit

Permalink
Rollback adding support for ARM intrinsics
Browse files Browse the repository at this point in the history
In some configurations this change causes compilation errors. We will roll this
forward again after those issue are addressed.

PiperOrigin-RevId: 562810916
Change-Id: I45b2a8d456273e9eff188f36da8f11323c4dfe66
  • Loading branch information
Abseil Team authored and copybara-github committed Sep 5, 2023
1 parent 1a88283 commit 461f1e4
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 76 deletions.
2 changes: 1 addition & 1 deletion CMake/AbseilDll.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ set(ABSL_INTERNAL_DLL_FILES
"crc/internal/crc_x86_arm_combined.cc"
"crc/internal/crc_memcpy_fallback.cc"
"crc/internal/crc_memcpy.h"
"crc/internal/crc_memcpy_x86_arm_combined.cc"
"crc/internal/crc_memcpy_x86_64.cc"
"crc/internal/crc_non_temporal_memcpy.cc"
"crc/internal/crc_x86_arm_combined.cc"
"crc/internal/non_temporal_arm_intrinsics.h"
Expand Down
5 changes: 4 additions & 1 deletion absl/crc/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@ cc_library(
visibility = ["//visibility:private"],
deps = [
":cpu_detect",
"//absl/base",
"//absl/base:config",
"//absl/base:core_headers",
"//absl/base:dynamic_annotations",
"//absl/base:endian",
"//absl/base:prefetch",
"//absl/base:raw_logging_internal",
Expand All @@ -70,7 +72,7 @@ cc_library(
"crc32c.cc",
"internal/crc32c_inline.h",
"internal/crc_memcpy_fallback.cc",
"internal/crc_memcpy_x86_arm_combined.cc",
"internal/crc_memcpy_x86_64.cc",
"internal/crc_non_temporal_memcpy.cc",
],
hdrs = [
Expand All @@ -87,6 +89,7 @@ cc_library(
":non_temporal_memcpy",
"//absl/base:config",
"//absl/base:core_headers",
"//absl/base:dynamic_annotations",
"//absl/base:endian",
"//absl/base:prefetch",
"//absl/strings",
Expand Down
5 changes: 4 additions & 1 deletion absl/crc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@ absl_cc_library(
${ABSL_DEFAULT_COPTS}
DEPS
absl::crc_cpu_detect
absl::base
absl::config
absl::core_headers
absl::dynamic_annotations
absl::endian
absl::prefetch
absl::raw_logging_internal
Expand All @@ -62,7 +64,7 @@ absl_cc_library(
"crc32c.cc"
"internal/crc32c_inline.h"
"internal/crc_memcpy_fallback.cc"
"internal/crc_memcpy_x86_arm_combined.cc"
"internal/crc_memcpy_x86_64.cc"
"internal/crc_non_temporal_memcpy.cc"
COPTS
${ABSL_DEFAULT_COPTS}
Expand All @@ -72,6 +74,7 @@ absl_cc_library(
absl::non_temporal_memcpy
absl::config
absl::core_headers
absl::dynamic_annotations
absl::endian
absl::prefetch
absl::str_format
Expand Down
31 changes: 3 additions & 28 deletions absl/crc/internal/crc32_x86_arm_combined_simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,8 @@ namespace crc_internal {

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
using V128u = uint64x2_t;
#else
using V128 = __m128i;
using V128u = __m128i_u;
#endif

// Starting with the initial value in |crc|, accumulates a CRC32 value for
Expand All @@ -78,10 +76,7 @@ uint32_t CRC32_u64(uint32_t crc, uint64_t v);
V128 V128_Load(const V128* src);

// Load 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128u* src);

// Store 128 bits of integer data. |src| must be 16-byte aligned.
void V128_Store(V128* dst, V128 data);
V128 V128_LoadU(const V128* src);

// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);
Expand Down Expand Up @@ -114,10 +109,6 @@ V128 V128_ShiftRight(const V128 l);
template <int imm>
int V128_Extract32(const V128 l);

// Extracts a 64-bit integer from |l|, selected with |imm|.
template <int imm>
uint64_t V128_Extract64(const V128 l);

// Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l);

Expand Down Expand Up @@ -146,9 +137,7 @@ inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {

inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128u* src) { return _mm_loadu_si128(src); }

inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline V128 V128_PMulHi(const V128 l, const V128 r) {
return _mm_clmulepi64_si128(l, r, 0x11);
Expand Down Expand Up @@ -184,11 +173,6 @@ inline int V128_Extract32(const V128 l) {
return _mm_extract_epi32(l, imm);
}

template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
}

inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }

inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
Expand All @@ -215,14 +199,10 @@ inline V128 V128_Load(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline V128 V128_LoadU(const V128u* src) {
inline V128 V128_LoadU(const V128* src) {
return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline void V128_Store(V128* dst, V128 data) {
vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
}

// Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%.
// TODO(b/193678732): Investigate why the compiler decides not to generate
Expand Down Expand Up @@ -272,11 +252,6 @@ inline int V128_Extract32(const V128 l) {
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}

template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), imm);
}

inline int64_t V128_Low64(const V128 l) {
return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}
Expand Down
3 changes: 0 additions & 3 deletions absl/crc/internal/crc_memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,12 @@

#include "absl/base/config.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"

// Defined if the class AcceleratedCrcMemcpyEngine exists.
#if defined(__x86_64__) && defined(__SSE4_2__)
#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
#elif defined(_MSC_VER) && defined(__AVX__)
#define ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE 1
#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
#define ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE 1
#endif

namespace absl {
Expand Down
6 changes: 2 additions & 4 deletions absl/crc/internal/crc_memcpy_fallback.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ absl::crc32c_t FallbackCrcMemcpyEngine::Compute(void* __restrict dst,
}

// Compile the following only if we don't have
#if !defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) && \
!defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
#ifndef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE

CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
CrcMemcpy::ArchSpecificEngines engines;
Expand All @@ -69,8 +68,7 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int /*vector*/,
return std::make_unique<FallbackCrcMemcpyEngine>();
}

#endif // !ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE &&
// !ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE

} // namespace crc_internal
ABSL_NAMESPACE_END
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
// registers because XMM registers do not support the CRC instruction (yet).
// While copying, compute the running CRC of the data being copied.
// Simultaneous memcopy and CRC-32C for x86-64. Uses integer registers because
// XMM registers do not support the CRC instruction (yet). While copying,
// compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true.
Expand Down Expand Up @@ -57,12 +57,10 @@
#include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"

#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE

namespace absl {
ABSL_NAMESPACE_BEGIN
Expand All @@ -77,43 +75,44 @@ inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
uint32_t crc_uint32 = static_cast<uint32_t>(crc);
for (std::size_t i = 0; i < length; i++) {
uint8_t data = *reinterpret_cast<const uint8_t*>(src);
crc_uint32 = CRC32_u8(crc_uint32, data);
crc_uint32 = _mm_crc32_u8(crc_uint32, data);
*reinterpret_cast<uint8_t*>(dst) = data;
++src;
++dst;
}
return crc32c_t{crc_uint32};
}

constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);
constexpr size_t kIntLoadsPerVec = sizeof(__m128i) / sizeof(uint64_t);

// Common function for copying the tails of multiple large regions.
template <size_t vec_regions, size_t int_regions>
inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t region_size, size_t copy_rounds) {
std::array<V128, vec_regions> data;
std::array<__m128i, vec_regions> data;
std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;

while (copy_rounds > 0) {
for (size_t i = 0; i < vec_regions; i++) {
size_t region = i;

auto* vsrc = reinterpret_cast<const V128u*>(*src + region_size * region);
auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);
auto* vsrc =
reinterpret_cast<const __m128i*>(*src + region_size * region);
auto* vdst = reinterpret_cast<__m128i*>(*dst + region_size * region);

// Load the blocks, unaligned
data[i] = V128_LoadU(vsrc);
data[i] = _mm_loadu_si128(vsrc);

// Store the blocks, aligned
V128_Store(vdst, data[i]);
_mm_store_si128(vdst, data[i]);

// Compute the running CRC
crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(data[i], 0))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
_mm_crc32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(data[i], 1))))};
}

for (size_t i = 0; i < int_regions; i++) {
Expand All @@ -127,16 +126,16 @@ inline void LargeTailCopy(crc32c_t* crcs, char** dst, const char** src,
size_t data_index = i * kIntLoadsPerVec + j;

int_data[data_index] = *(usrc + j);
crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))};

*(udst + j) = int_data[data_index];
}
}

// Increment pointers
*src += sizeof(V128);
*dst += sizeof(V128);
*src += sizeof(__m128i);
*dst += sizeof(__m128i);
--copy_rounds;
}
}
Expand All @@ -162,7 +161,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
constexpr std::size_t kRegions = vec_regions + int_regions;
static_assert(kRegions > 0, "Must specify at least one region.");
constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
constexpr std::size_t kBlockSize = sizeof(V128);
constexpr std::size_t kBlockSize = sizeof(__m128i);
constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;

// Number of blocks per cacheline.
Expand Down Expand Up @@ -238,18 +237,15 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
const std::size_t tail_size = length - (kRegions * region_size);

// Holding registers for data in each region.
std::array<V128, vec_regions> vec_data;
std::array<__m128i, vec_regions> vec_data;
std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;

// Main loop.
while (copy_rounds > kBlocksPerCacheLine) {
// Prefetch kPrefetchAhead bytes ahead of each pointer.
for (size_t i = 0; i < kRegions; i++) {
absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
// TODO(b/297082454): investigate dropping prefetch on x86.
absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
#endif
}

// Load and store data, computing CRC on the way.
Expand All @@ -262,20 +258,21 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
size_t region = (j + i) % kRegions;

auto* vsrc =
reinterpret_cast<const V128u*>(src_bytes + region_size * region);
auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);
reinterpret_cast<const __m128i*>(src_bytes + region_size * region);
auto* vdst =
reinterpret_cast<__m128i*>(dst_bytes + region_size * region);

// Load and CRC data.
vec_data[j] = V128_LoadU(vsrc + i);
crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(
CRC32_u64(static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};
vec_data[j] = _mm_loadu_si128(vsrc + i);
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 0))))};
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]),
static_cast<uint64_t>(_mm_extract_epi64(vec_data[j], 1))))};

// Store the data.
V128_Store(vdst + i, vec_data[j]);
_mm_store_si128(vdst + i, vec_data[j]);
}

// Preload the partial CRCs for the CLMUL subregions.
Expand All @@ -295,7 +292,7 @@ crc32c_t AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(

// Load and CRC the data.
int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
crcs[region] = crc32c_t{static_cast<uint32_t>(CRC32_u64(
crcs[region] = crc32c_t{static_cast<uint32_t>(_mm_crc32_u64(
static_cast<uint32_t>(crcs[region]), int_data[data_index]))};

// Store the data.
Expand Down Expand Up @@ -446,5 +443,4 @@ std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
ABSL_NAMESPACE_END
} // namespace absl

#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
// ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
#endif // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE

0 comments on commit 461f1e4

Please sign in to comment.