Skip to content
4 changes: 4 additions & 0 deletions include/counters_mapping.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ DEFINE_COUNTER_RANGE(cycles, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)
DEFINE_COUNTER_RANGE(instructions, all, PERF_TYPE_HARDWARE,
PERF_COUNT_HW_INSTRUCTIONS)

// cache misses and cache references, exposed as llc_misses / llc_loads
// NOTE(review): PERF_COUNT_HW_CACHE_REFERENCES is documented as counting cache
// accesses in general (usually last-level cache), not only loads — confirm
// that the llc_loads name matches what is intended to be measured.
DEFINE_COUNTER_RANGE(llc_misses, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES)
DEFINE_COUNTER_RANGE(llc_loads, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES)

// branch mispredictions
DEFINE_COUNTER_RANGE(branch_misses, all, PERF_TYPE_HARDWARE,
PERF_COUNT_HW_BRANCH_MISSES)
Expand Down
6 changes: 5 additions & 1 deletion include/uarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ enum uarch {
// qualcomm
oryon,
// arm
cortex_a53,
cortex_a55,
cortex_a73,
cortex_a77,
cortex_a78,
cortex_x1,
Expand All @@ -23,7 +26,7 @@ enum uarch {
neoverse_v2,
// hisilicon
tsv110,

tsv200m,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any references to this model? If not, I'd rather not include it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You know HiSilicon does not release any official specs, so I'm not very sure. But it refers to the microarchitecture on my OrangePi AI Pro.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand, but please avoid non-public architecture names.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could I use the SKU to refer to the architecture? If not set, there might be some problems for users using this hisilicon chip.

unknown_arm64,
arm64_begin = firestorm,
arm64_end = unknown_arm64,
Expand All @@ -41,6 +44,7 @@ enum uarch {
sunny_cove,
skylake,
broadwell,
whiskylake,
// amd
zen1,
zen2,
Expand Down
3 changes: 3 additions & 0 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,5 +129,8 @@ int virt_to_phys_user(uintptr_t *paddr, uintptr_t vaddr);
// Default value for PHR_BRANCHES; a definition supplied earlier (for example
// with -DPHR_BRANCHES=... on the compiler command line) takes precedence.
#ifndef PHR_BRANCHES
#define PHR_BRANCHES 64
#endif
// Default value for PHRB_BRANCHES, overridable in the same way.
#ifndef PHRB_BRANCHES
#define PHRB_BRANCHES 32
#endif

#endif
93 changes: 86 additions & 7 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ cpp_args = []
link_args = []
cpu = target_machine.cpu_family()

# ISA support flags. They start out false and are switched to true further
# down when the detection command's stdout contains the matching
# 'AVX2 detected' / 'AVX512F detected' / 'SVE detected' line; the optional
# SIMD executables are only declared when their flag is true.
avx2_support = false
avx512f_support = false
sve_support = false

if get_option('ios')
message('Configured for iOS')
cpp_args += ['-DIOS', '-std=c++11', '-DHOST_AARCH64', '-march=armv8.4-a']
Expand All @@ -20,12 +24,24 @@ else
foreach line : r.stderr().strip().split('\n')
message(line)
endforeach
foreach line : r.stdout().strip().split('\n')
cpp_args += [line]
# Classify every line printed by the detection command:
#   - lines that start with '-' are compiler flags and go into cpp_args;
#   - the '<ISA> detected' lines switch on the matching support flag;
#   - anything else is ignored.
# startswith() is used instead of line[0] because indexing an empty string
# (an empty stdout still yields one empty line after split) aborts the
# configuration with an index-out-of-bounds error.
foreach line : r.stdout().strip().split('\n')
  if line.startswith('-')
    cpp_args += [line]
  elif line == 'AVX2 detected'
    avx2_support = true
  elif line == 'AVX512F detected'
    avx512f_support = true
  elif line == 'SVE detected'
    sve_support = true
  endif
endforeach
message('Got CXXFLAGS:', cpp_args)
endif

if cpu == 'x86_64'
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we already print CXXFLAGS above?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh it's for ISA detection.

message('Got CXXFLAGS:', cpp_args)
endif


message('Final CXXFLAGS:', cpp_args)
message('Final LDFLAGS:', link_args)
Expand All @@ -49,18 +65,81 @@ endforeach
libs = []

if cpu == 'x86_64'
gather_avx2 = executable('gather_avx2',
'src/gather.cpp',
cpp_args: ['-DAVX2', '-mavx2'],
link_with: utils,
install: true)
if avx2_support
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can just build these binaries without checking if the cpu actually supports it? It allows us to build them on one machine, and run them on another.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's all right. I'm just afraid that it may cause confusion.

# AVX2 builds of the gather and division benchmarks. Both are compiled with
# -mavx2 and with the AVX2 macro defined, which selects the AVX2 code path in
# the source file.
gather_avx2 = executable('gather_avx2',
'src/gather.cpp',
cpp_args: ['-DAVX2', '-mavx2'],
link_with: utils,
install: true)
div_avx2 = executable('div_avx2',
'src/div.cpp',
cpp_args: ['-DAVX2', '-mavx2'],
link_with: utils,
install: true)
endif
# AVX-512 build of the gather benchmark; only declared when the detection
# output contained 'AVX512F detected'.
if avx512f_support
gather_avx512 = executable('gather_avx512',
'src/gather.cpp',
cpp_args: ['-DAVX512', '-mavx512f'],
link_with: utils,
install: true)
endif
elif cpu == 'aarch64'
# gather_neon = executable('gather_neon',
# 'src/gather_aarch64.cpp',
# cpp_args: ['-DNEON'],
# link_with: utils,
# install: true)
# SVE executables; only declared when the detection output contained
# 'SVE detected'. All of them are compiled with -march=armv8.6-a+sve.
if sve_support
# SVE build of the aarch64 gather benchmark.
gather_sve = executable('gather_sve',
'src/gather_aarch64.cpp',
cpp_args: ['-DSVE', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
# The four targets below share src/simd_aarch64.cpp; the -DSVE_* macro picks
# the variant (fp32/fp64, add/fma) that gets compiled in.
sve_fp32_add = executable('sve_fp32_add',
'src/simd_aarch64.cpp',
cpp_args: ['-DSVE_FP32_ADD', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
sve_fp64_add = executable('sve_fp64_add',
'src/simd_aarch64.cpp',
cpp_args: ['-DSVE_FP64_ADD', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
sve_fp32_fma = executable('sve_fp32_fma',
'src/simd_aarch64.cpp',
cpp_args: ['-DSVE_FP32_FMA', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
sve_fp64_fma = executable('sve_fp64_fma',
'src/simd_aarch64.cpp',
cpp_args: ['-DSVE_FP64_FMA', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
endif
# NEON executables. Unlike the SVE targets they are not guarded by a
# detection flag and are declared for every aarch64 build. All four share
# src/simd_aarch64.cpp; the -DNEON_* macro picks the variant (fp32/fp64,
# add/fma) that gets compiled in.
neon_fp32_add = executable('neon_fp32_add',
'src/simd_aarch64.cpp',
cpp_args: ['-DNEON_FP32_ADD'],
link_with: utils,
install: true)
neon_fp64_add = executable('neon_fp64_add',
'src/simd_aarch64.cpp',
cpp_args: ['-DNEON_FP64_ADD'],
link_with: utils,
install: true)
neon_fp32_fma = executable('neon_fp32_fma',
'src/simd_aarch64.cpp',
cpp_args: ['-DNEON_FP32_FMA'],
link_with: utils,
install: true)
neon_fp64_fma = executable('neon_fp64_fma',
'src/simd_aarch64.cpp',
cpp_args: ['-DNEON_FP64_FMA'],
link_with: utils,
install: true)
endif


cpp = meson.get_compiler('cpp')
cpu = target_machine.cpu_family()

Expand Down
13 changes: 13 additions & 0 deletions src/detect_uarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ int main() {
case cortex_a77:
printf("-DARM_CORTEX_A77\n");
break;
case cortex_a53:
printf("-DARM_CORTEX_A53\n");
break;
case cortex_a55:
printf("-DARM_CORTEX_A55\n");
break;
case cortex_a73:
printf("-DARM_CORTEX_A73\n");
break;
case cortex_x1:
printf("-DARM_CORTEX_X1\n");
break;
Expand Down Expand Up @@ -70,6 +79,10 @@ int main() {
printf("-DINTEL\n");
printf("-DINTEL_BROADWELL\n");
break;
case whiskylake:
printf("-DINTEL\n");
printf("-DINTEL_WHISKYLAKE\n");
break;
case zen1:
printf("-DAMD\n");
printf("-DAMD_ZEN1\n");
Expand Down
127 changes: 127 additions & 0 deletions src/div.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include "include/utils.h"
#include <immintrin.h>
#include <time.h>
#include <unistd.h>

// Accumulator that test_1() adds its final lane-0 value to and that main()
// prints as "ans" (presumably so the divisions are not optimized away).
int res = 0;
// Number of elements in the scratch array below.
const int n = 1000;
// Scratch array; main() stores each generated random value at the index
// equal to that value.
int array[n] = {0};
// Loop iterations executed by one call of test_1().
const int repeat = 500;
// Number of division statements inside each loop iteration of test_1();
// main() divides the counter deltas by it, so it must match the kernel.
const int unroll = 16;

// Division kernel measured by main().
//
// Runs `repeat` loop iterations of 16 vector double-precision divisions.
// Every division consumes the result of the previous one, so the 16
// statements form a single dependency chain. The count of 16 has to stay in
// sync with the file-level `unroll` constant used when reporting.
//
// The kernel is selected at compile time: AVX2 uses 256-bit vectors, AVX512
// uses 512-bit vectors; with neither macro defined the function does nothing.
// `indices` is accepted but never read. Lane 0 of the final value is added to
// the global `res`.
void test_1(int *indices) {
#ifdef AVX2
  __m256d acc = _mm256_set1_pd(3.33);
  const __m256d divisor = _mm256_set1_pd(1.0001);
  for (int round = 0; round < repeat; ++round) {
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
    acc = _mm256_div_pd(acc, divisor);
  }
  res += acc[0];
#endif
#ifdef AVX512
  __m512d acc = _mm512_set1_pd(3.33);
  const __m512d divisor = _mm512_set1_pd(1.0001);
  for (int round = 0; round < repeat; ++round) {
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
    acc = _mm512_div_pd(acc, divisor);
  }
  res += acc[0];
#endif
}

// Benchmark driver for test_1().
//
// Calls bind_to_core(), sets up the instruction and cycle counters, runs a
// warm-up phase, then times m calls of test_1() and prints the counter deltas
// divided by the number of divisions executed (m * repeat * unroll), the
// overall IPC and the accumulated `res` value.
//
// No command-line options are accepted. Returns 0 on success and exits with
// EXIT_FAILURE when getopt() reports an option.
int main(int argc, char *argv[]) {
  // The option string is empty, so every option getopt() returns is unknown
  // and ends in the usage message.
  int opt;
  while ((opt = getopt(argc, argv, "")) != -1) {
    switch (opt) {
    default:
      fprintf(stderr, "Usage: %s\n", argv[0]);
      exit(EXIT_FAILURE);
    }
  }

  bind_to_core();
  setup_perf_instructions();
  setup_perf_cycles();

  // Number of random values generated and printed below; chosen by the same
  // macro that selects the kernel in test_1().
#ifdef AVX2
  const int vlen = 8;
#endif
#ifdef AVX512
  const int vlen = 16;
#endif
  int indices[vlen];
  srand(time(NULL));
  for (int i = 0; i < vlen; i++) {
    indices[i] = rand() % 32;
  }

  printf("Numbers:");
  for (int i = 0; i < vlen; i++) {
    // generate patterns
    printf(" %d", indices[i]);
    array[indices[i]] = indices[i];
  }
  printf("\n");

  // Untimed calls before the measurement starts.
  const int warmup = 1000;
  for (int i = 0; i < warmup; i++) {
    test_1(indices);
  }

  const int m = 50000;
  uint64_t cycles_before = perf_read_cycles();
  uint64_t instructions_before = perf_read_instructions();

  for (int i = 0; i < m; i++) {
    test_1(indices);
  }

  uint64_t cycles_after = perf_read_cycles();
  uint64_t instructions_after = perf_read_instructions();

  const uint64_t cycles = cycles_after - cycles_before;
  const uint64_t instructions = instructions_after - instructions_before;

  // i9-14900K: AVX2 24 cycles
  // i9-12900KS: AVX2 24 cycles
  // i9-10980XE: AVX2 38 cycles, AVX512 43 cycles
  // EPYC 9654: AVX2 20 cycles, AVX512 33 cycles
  // EPYC 7742: AVX2 21 cycles
  // EPYC 7551: AVX2 20 cycles
  //
  // uint64_t is not guaranteed to be `long`, so the values are converted to
  // unsigned long long and printed with %llu instead of %ld.
  printf("%llu cycles, %llu instructions, %.2lf ipc, %d ans\n",
         (unsigned long long)(cycles / m / repeat / unroll),
         (unsigned long long)(instructions / m / repeat / unroll),
         (double)instructions / cycles, res);
  return 0;
}
Loading