Skip to content

Commit

Permalink
Better vectorization and crc64. Cleaned up cmake and added better run…
Browse files Browse the repository at this point in the history
…time cpu detection (#1083)

Co-authored-by: Alfred G <28123637+alfred2g@users.noreply.github.com>
Co-authored-by: Alfred Gedeon <alfred2g@hotmail.com>
  • Loading branch information
3 people authored Mar 22, 2024
1 parent 2fd6652 commit 33c1bfb
Show file tree
Hide file tree
Showing 10 changed files with 178 additions and 34 deletions.
14 changes: 9 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,17 @@ if (USE_CPU_EXTENSIONS)
)
endif()
elseif (AWS_ARCH_ARM64 OR AWS_ARCH_ARM32)
if (MSVC)
if (WINDOWS)
file(GLOB AWS_COMMON_ARCH_SRC
"source/arch/arm/msvc/*.c"
"source/arch/arm/windows/*.c"
)
elseif (AWS_HAVE_AUXV)
elseif(APPLE)
file(GLOB AWS_COMMON_ARCH_SRC
"source/arch/arm/darwin/*.c"
)
else()
file(GLOB AWS_COMMON_ARCH_SRC
"source/arch/arm/asm/*.c"
"source/arch/arm/auxv/*.c"
)
endif()
endif()
Expand Down Expand Up @@ -221,7 +225,7 @@ target_compile_definitions(${PROJECT_NAME} PRIVATE -DCJSON_HIDE_SYMBOLS)

if (AWS_HAVE_AVX2_INTRINSICS)
target_compile_definitions(${PROJECT_NAME} PRIVATE -DUSE_SIMD_ENCODING)
simd_add_source_avx(${PROJECT_NAME} "source/arch/intel/encoding_avx2.c")
simd_append_source_and_features(${PROJECT_NAME} "source/arch/intel/encoding_avx2.c" ${AWS_AVX2_FLAG})
message(STATUS "Building SIMD base64 decoder")
endif()

Expand Down
14 changes: 14 additions & 0 deletions bin/system_info/print_system_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <aws/common/byte_buf.h>
#include <aws/common/logging.h>
#include <aws/common/system_info.h>
#include <aws/common/cpuid.h>

int main(void) {
struct aws_allocator *allocator = aws_default_allocator();
Expand Down Expand Up @@ -39,6 +40,19 @@ int main(void) {
fprintf(stdout, " 'numa architecture': 'false'\n");
}

fprintf(stdout, " 'cpu_capabilities': {\n");
fprintf(stdout, " 'arm_crc': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC) ? "true" : "false");
fprintf(stdout, " 'arm_pmull': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_PMULL) ? "true" : "false");
fprintf(stdout, " 'arm_crypto': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRYPTO) ? "true" : "false");
fprintf(stdout, " 'amd_sse4_1': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_1) ? "true" : "false");
fprintf(stdout, " 'amd_sse4_2': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) ? "true" : "false");
fprintf(stdout, " 'amd_clmul': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) ? "true" : "false");
fprintf(stdout, " 'amd_vpclmulqdq': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ) ? "true" : "false");
fprintf(stdout, " 'amd_avx2': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2) ? "true" : "false");
fprintf(stdout, " 'amd_avx512': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) ? "true" : "false");
fprintf(stdout, " 'amd_bmi2': %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_BMI2) ? "true" : "false");
fprintf(stdout, " }\n");

fprintf(stdout, "}\n");
aws_system_environment_release(env);
aws_logger_clean_up(&logger);
Expand Down
13 changes: 13 additions & 0 deletions cmake/AwsFeatureTests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ if(MINGW)
set(USE_CPU_EXTENSIONS OFF)
endif()

if (USE_CPU_EXTENSIONS)
set(AWS_USE_CPU_EXTENSIONS ON)
endif()

if(NOT CMAKE_CROSSCOMPILING)
check_c_source_runs("
#include <stdbool.h>
Expand Down Expand Up @@ -54,6 +58,15 @@ check_c_source_compiles("
}
" AWS_ARCH_INTEL)

check_c_source_compiles("
int main() {
#if !(defined(__x86_64__) || defined(_M_X64))
# error \"not intel\"
#endif
return 0;
}
" AWS_ARCH_INTEL_X64)

check_c_source_compiles("
int main() {
#if !(defined(__aarch64__) || defined(_M_ARM64))
Expand Down
97 changes: 73 additions & 24 deletions cmake/AwsSIMD.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,39 @@
include(CheckCCompilerFlag)
include(CheckIncludeFile)

if (MSVC)
set(AWS_AVX2_FLAG "/arch:AVX2")
set(AWS_AVX512_FLAG "/arch:AVX512")
set(AWS_AVX512vL_FLAG "")
set(AWS_CLMUL_FLAG "")
set(AWS_SSE4_2_FLAG "")
set(AWS_ARMv8_1_FLAG "/arch:arm8.1")
set(WERROR_FLAG "")
else()
set(AWS_AVX2_FLAG "-mavx -mavx2")
set(AWS_AVX512_FLAG "-mavx512f -mvpclmulqdq")
set(AWS_AVX512vL_FLAG "-mavx512vl")
set(AWS_CLMUL_FLAG "-mpclmul")
set(AWS_SSE4_2_FLAG "-msse4.2")
set(AWS_ARMv8_1_FLAG "-march=armv8-a+crc+crypto -mtune=neoverse-v1")
set(WERROR_FLAG "-Werror")
endif()

if (USE_CPU_EXTENSIONS)
if (MSVC)
check_c_compiler_flag("/arch:AVX2" HAVE_M_AVX2_FLAG)
if (HAVE_M_AVX2_FLAG)
set(AVX_CFLAGS "/arch:AVX2")
endif()
else()
check_c_compiler_flag(-mavx2 HAVE_M_AVX2_FLAG)
if (HAVE_M_AVX2_FLAG)
set(AVX_CFLAGS "-mavx -mavx2")
endif()
set(AVX_CFLAGS ${AWS_SSE4_2_FLAG})

check_c_compiler_flag(${AWS_AVX2_FLAG} HAVE_M_AVX2_FLAG)
if (HAVE_M_AVX2_FLAG)
set(AVX_CFLAGS "${AWS_AVX2_FLAG} ${AVX_CFLAGS}")
endif()

if (MSVC)
check_c_compiler_flag("/arch:AVX512" HAVE_M_AVX512_FLAG)
if (HAVE_M_AVX512_FLAG)
# docs imply AVX512 brings in AVX2. And it will compile, but it will break at runtime on
# instructions such as _mm256_load_si256(). Leave it on.
set(AVX_CFLAGS "/arch:AVX512 /arch:AVX2")
endif()
else()
check_c_compiler_flag("-mavx512f -mvpclmulqdq" HAVE_M_AVX512_FLAG)
if (HAVE_M_AVX512_FLAG)
set(AVX_CFLAGS "-mavx512f -mvpclmulqdq -mpclmul -mavx -mavx2 -msse4.2")
endif()
check_c_compiler_flag("${AWS_AVX512_FLAG} ${AWS_CLMUL_FLAG}" HAVE_M_AVX512_FLAG)
if (HAVE_M_AVX512_FLAG)
set(AVX_CFLAGS "${AWS_AVX512_FLAG} ${AWS_CLMUL_FLAG} ${AVX_CFLAGS}")
endif()

set(old_flags "${CMAKE_REQUIRED_FLAGS}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AVX_CFLAGS}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AVX_CFLAGS} ${WERROR_FLAG}")

check_c_source_compiles("
#include <immintrin.h>
Expand Down Expand Up @@ -68,7 +72,35 @@ if (USE_CPU_EXTENSIONS)
return (int)_mm256_extract_epi64(vec, 2);
}" AWS_HAVE_MM256_EXTRACT_EPI64)

check_c_source_compiles("
#include <wmmintrin.h>
#include <emmintrin.h>
int main() {
__m128i a = _mm_setzero_si128();
__m128i b = _mm_setzero_si128();
__m128i result = _mm_clmulepi64_si128(a, b, 0x00);
(void)result;
return 0;
}" AWS_HAVE_CLMUL)

set(CMAKE_REQUIRED_FLAGS "${old_flags} ${AWS_ARMv8_1_FLAG} ${WERROR_FLAG}")
check_c_source_compiles("
#include <arm_acle.h>
int main() {
int crc = __crc32d(0, 1);
return 0;
}" AWS_HAVE_ARM32_CRC)

check_c_source_compiles("
#include <stdatomic.h>
int main() {
_Atomic int var = 0;
atomic_fetch_add_explicit(&var, 1, memory_order_relaxed);
return 0;
}" AWS_HAVE_ARMv8_1)

set(CMAKE_REQUIRED_FLAGS "${old_flags}")

endif() # USE_CPU_EXTENSIONS

# The part where the definition is added to the compiler flags has been moved to config.h.in
Expand All @@ -80,6 +112,23 @@ endif() # USE_CPU_EXTENSIONS
function(simd_add_source_avx target)
foreach(file ${ARGN})
target_sources(${target} PRIVATE ${file})
set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS "${AVX_CFLAGS}")
set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS " ${AVX_CFLAGS}")
endforeach()
endfunction(simd_add_source_avx)

# The part where the definition is added to the compiler flags has been moved to config.h.in
# see git history for more details.

# Adds compiler flags to the source and adds the source to target.
# Unfortunately the flags have to be passed as strings. Predefined flags are
# at the top of this file.
# Usage: simd_append_source_and_features(target file1.c ${AWS_AVX512_FLAG} ${AWS_AVX2_FLAG} ...)
function(simd_append_source_and_features target file)
set(CC_FLAGS "")
foreach(flag ${ARGN})
set(CC_FLAGS "${CC_FLAGS} ${flag}")
endforeach()

target_sources(${target} PRIVATE ${file})
set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS " ${CC_FLAGS}")
endfunction(simd_append_source_and_features)
7 changes: 7 additions & 0 deletions include/aws/common/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,12 @@
#cmakedefine AWS_HAVE_AVX2_INTRINSICS
#cmakedefine AWS_HAVE_AVX512_INTRINSICS
#cmakedefine AWS_HAVE_MM256_EXTRACT_EPI64
#cmakedefine AWS_HAVE_CLMUL
#cmakedefine AWS_HAVE_ARM32_CRC
#cmakedefine AWS_HAVE_ARMv8_1
#cmakedefine AWS_ARCH_ARM64
#cmakedefine AWS_ARCH_INTEL
#cmakedefine AWS_ARCH_INTEL_X64
#cmakedefine AWS_USE_CPU_EXTENSIONS

#endif
2 changes: 2 additions & 0 deletions include/aws/common/cpuid.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ enum aws_cpu_feature_name {
AWS_CPU_FEATURE_ARM_CRC,
AWS_CPU_FEATURE_BMI2,
AWS_CPU_FEATURE_VPCLMULQDQ,
AWS_CPU_FEATURE_ARM_PMULL,
AWS_CPU_FEATURE_ARM_CRYPTO,
AWS_CPU_FEATURE_COUNT,
};

Expand Down
8 changes: 7 additions & 1 deletion source/arch/arm/asm/cpuid.c → source/arch/arm/auxv/cpuid.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ struct cap_bits {

# if (defined(__aarch64__))
struct cap_bits s_check_cap[AWS_CPU_FEATURE_COUNT] = {
[AWS_CPU_FEATURE_ARM_CRC] = {0, 1 << 7 /* HWCAP_CRC */},
[AWS_CPU_FEATURE_ARM_CRC] = {0, 1 << 7 /* HWCAP_CRC32 */},
[AWS_CPU_FEATURE_ARM_PMULL] = {0, 1 << 4 /* HWCAP_PMULL */},
[AWS_CPU_FEATURE_ARM_CRYPTO] = {0, 1 << 3 /* HWCAP_AES */},
};
# else
struct cap_bits s_check_cap[AWS_CPU_FEATURE_COUNT] = {
Expand Down Expand Up @@ -67,6 +69,10 @@ bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {

switch (feature_name) {
case AWS_CPU_FEATURE_ARM_CRC:
# if (defined(__aarch64__))
case AWS_CPU_FEATURE_ARM_PMULL:
case AWS_CPU_FEATURE_ARM_CRYPTO:
# endif // (defined(__aarch64__))
return s_hwcap[s_check_cap[feature_name].cap] & s_check_cap[feature_name].bit;
default:
return false;
Expand Down
40 changes: 40 additions & 0 deletions source/arch/arm/darwin/cpuid.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://aws.amazon.com/apache2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

#include <aws/common/cpuid.h>

#include <sys/sysctl.h>

bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {
int64_t ret = 0;
size_t size = sizeof(ret);

switch (feature_name) {
case AWS_CPU_FEATURE_ARM_PMULL:
if (sysctlbyname("hw.optional.arm.FEAT_PMULL", &ret, &size, NULL, 0) != -1) {
return ret == 1;
}
case AWS_CPU_FEATURE_ARM_CRC:
if (sysctlbyname("hw.optional.armv8_crc32", &ret, &size, NULL, 0) != -1) {
return ret == 1;
}
case AWS_CPU_FEATURE_ARM_CRYPTO:
if (sysctlbyname("hw.optional.arm.FEAT_AES", &ret, &size, NULL, 0) != -1) {
return ret == 1;
}
default:
return false;
}
}
13 changes: 11 additions & 2 deletions source/arch/arm/msvc/cpuid.c → source/arch/arm/windows/cpuid.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,18 @@
* permissions and limitations under the License.
*/

#include <Windows.h>
#include <aws/common/cpuid.h>
#include <stdlib.h>

bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {
return false;
switch (feature_name) {
case AWS_CPU_FEATURE_ARM_CRC:
return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;
// this is the best we've got on windows as they don't separate PMULL and AES from each other.
case AWS_CPU_FEATURE_ARM_PMULL:
case AWS_CPU_FEATURE_ARM_CRYPTO:
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0;
default:
return false;
}
}
4 changes: 2 additions & 2 deletions source/arch/intel/cpuid.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ static bool s_has_bmi2(void) {
static bool s_has_vpclmulqdq(void) {
uint32_t abcd[4];
/* Check VPCLMULQDQ:
* CPUID.(EAX=07H, ECX=0H):ECX.VPCLMULQDQ[bit 20]==1 */
uint32_t vpclmulqdq_mask = (1 << 20);
* CPUID.(EAX=07H, ECX=0H):ECX.VPCLMULQDQ[bit 10]==1 */
uint32_t vpclmulqdq_mask = (1 << 10);
aws_run_cpuid(7, 0, abcd);
if ((abcd[2] & vpclmulqdq_mask) != vpclmulqdq_mask) {
return false;
Expand Down

0 comments on commit 33c1bfb

Please sign in to comment.