-
Notifications
You must be signed in to change notification settings - Fork 4
Fix some problems and Add more ARM supports #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
858cf90
d7a607b
1cc99a1
78647d4
223f872
93cb3ad
365f81a
49b31da
61f0c36
b1b1b07
3665e2b
f7efc92
7b1cc7f
5f3f0f6
46f0c57
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,9 @@ enum uarch { | |
// qualcomm | ||
oryon, | ||
// arm | ||
cortex_a53, | ||
cortex_a55, | ||
cortex_a73, | ||
cortex_a77, | ||
cortex_a78, | ||
cortex_x1, | ||
|
@@ -23,7 +26,7 @@ enum uarch { | |
neoverse_v2, | ||
// hisilicon | ||
tsv110, | ||
|
||
tsv200m, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are there any references to this model? If not, I'd rather not include this. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You know HiSilicon does not release any official specs, so I'm not very sure. But it refers to the microarchitecture on my OrangePi AI Pro. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I understand, but please avoid non-public architecture names. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could I use the SKU to refer to the architecture? If it is not set, there might be some problems for users of this HiSilicon chip. |
||
unknown_arm64, | ||
arm64_begin = firestorm, | ||
arm64_end = unknown_arm64, | ||
|
@@ -41,6 +44,7 @@ enum uarch { | |
sunny_cove, | ||
skylake, | ||
broadwell, | ||
whiskylake, | ||
// amd | ||
zen1, | ||
zen2, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,10 @@ cpp_args = [] | |
link_args = [] | ||
cpu = target_machine.cpu_family() | ||
|
||
avx2_support = false | ||
avx512f_support = false | ||
sve_support = false | ||
|
||
if get_option('ios') | ||
message('Configured for iOS') | ||
cpp_args += ['-DIOS', '-std=c++11', '-DHOST_AARCH64', '-march=armv8.4-a'] | ||
|
@@ -20,12 +24,24 @@ else | |
foreach line : r.stderr().strip().split('\n') | ||
message(line) | ||
endforeach | ||
foreach line : r.stdout().strip().split('\n') | ||
cpp_args += [line] | ||
foreach line : r.stdout().strip().split('\n') # check whether the line first char is - | ||
if line[0] == '-' | ||
cpp_args += [line] | ||
elif line == 'AVX2 detected' | ||
avx2_support = true | ||
elif line == 'AVX512F detected' | ||
avx512f_support = true | ||
elif line == 'SVE detected' | ||
sve_support = true | ||
endif | ||
endforeach | ||
message('Got CXXFLAGS:', cpp_args) | ||
endif | ||
|
||
if cpu == 'x86_64' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't we already print CXXFLAGS above? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, it's for ISA detection. |
||
message('Got CXXFLAGS:', cpp_args) | ||
endif | ||
|
||
|
||
message('Final CXXFLAGS:', cpp_args) | ||
message('Final LDFLAGS:', link_args) | ||
|
@@ -49,18 +65,81 @@ endforeach | |
libs = [] | ||
|
||
if cpu == 'x86_64' | ||
gather_avx2 = executable('gather_avx2', | ||
'src/gather.cpp', | ||
cpp_args: ['-DAVX2', '-mavx2'], | ||
link_with: utils, | ||
install: true) | ||
if avx2_support | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we can just build these binaries without checking whether the CPU actually supports them. That allows us to build them on one machine and run them on another. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's all right. I'm just afraid that it may cause confusion. |
||
gather_avx2 = executable('gather_avx2', | ||
'src/gather.cpp', | ||
cpp_args: ['-DAVX2', '-mavx2'], | ||
link_with: utils, | ||
install: true) | ||
div_avx2 = executable('div_avx2', | ||
'src/div.cpp', | ||
cpp_args: ['-DAVX2', '-mavx2'], | ||
link_with: utils, | ||
install: true) | ||
endif | ||
if avx512f_support | ||
gather_avx512 = executable('gather_avx512', | ||
'src/gather.cpp', | ||
cpp_args: ['-DAVX512', '-mavx512f'], | ||
link_with: utils, | ||
install: true) | ||
endif | ||
elif cpu == 'aarch64' | ||
# gather_neon = executable('gather_neon', | ||
# 'src/gather_aarch64.cpp', | ||
# cpp_args: ['-DNEON'], | ||
# link_with: utils, | ||
# install: true) | ||
if sve_support | ||
gather_sve = executable('gather_sve', | ||
'src/gather_aarch64.cpp', | ||
cpp_args: ['-DSVE', '-march=armv8.6-a+sve'], | ||
link_with: utils, | ||
install: true) | ||
sve_fp32_add = executable('sve_fp32_add', | ||
'src/simd_aarch64.cpp', | ||
cpp_args: ['-DSVE_FP32_ADD', '-march=armv8.6-a+sve'], | ||
link_with: utils, | ||
install: true) | ||
sve_fp64_add = executable('sve_fp64_add', | ||
'src/simd_aarch64.cpp', | ||
cpp_args: ['-DSVE_FP64_ADD', '-march=armv8.6-a+sve'], | ||
link_with: utils, | ||
install: true) | ||
sve_fp32_fma = executable('sve_fp32_fma', | ||
'src/simd_aarch64.cpp', | ||
cpp_args: ['-DSVE_FP32_FMA', '-march=armv8.6-a+sve'], | ||
link_with: utils, | ||
install: true) | ||
sve_fp64_fma = executable('sve_fp64_fma', | ||
'src/simd_aarch64.cpp', | ||
cpp_args: ['-DSVE_FP64_FMA', '-march=armv8.6-a+sve'], | ||
link_with: utils, | ||
install: true) | ||
endif | ||
neon_fp32_add = executable('neon_fp32_add', | ||
'src/simd_aarch64.cpp', | ||
cpp_args: ['-DNEON_FP32_ADD'], | ||
link_with: utils, | ||
install: true) | ||
neon_fp64_add = executable('neon_fp64_add', | ||
'src/simd_aarch64.cpp', | ||
cpp_args: ['-DNEON_FP64_ADD'], | ||
link_with: utils, | ||
install: true) | ||
neon_fp32_fma = executable('neon_fp32_fma', | ||
'src/simd_aarch64.cpp', | ||
cpp_args: ['-DNEON_FP32_FMA'], | ||
link_with: utils, | ||
install: true) | ||
neon_fp64_fma = executable('neon_fp64_fma', | ||
'src/simd_aarch64.cpp', | ||
cpp_args: ['-DNEON_FP64_FMA'], | ||
link_with: utils, | ||
install: true) | ||
endif | ||
|
||
|
||
cpp = meson.get_compiler('cpp') | ||
cpu = target_machine.cpu_family() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#include "include/utils.h" | ||
#include <immintrin.h> | ||
#include <time.h> | ||
#include <unistd.h> | ||
|
||
// Global accumulator: test_1() adds into it and main() prints it as "ans". | ||
int res = 0; | ||
// Number of elements in `array`. | ||
const int n = 1000; | ||
// Table that main() writes to at the randomly generated index positions. | ||
int array[n] = {0}; | ||
// Trip count of the loop inside test_1(). | ||
const int repeat = 500; | ||
// Divides issued per loop iteration in test_1(); main() divides the raw counter deltas by m, repeat and unroll. | ||
const int unroll = 16; | ||
|
||
// Packed-double division kernel used by the benchmark in main().
//
// Runs `repeat` loop iterations, each issuing 16 divides in which every
// divide takes the previous quotient as its dividend, then adds element 0 of
// the final vector to the global `res`.
//
// The vector width is chosen at compile time: the AVX2 macro selects the
// 256-bit intrinsics, the AVX512 macro selects the 512-bit ones. With neither
// macro defined the function body is empty.
//
// indices: accepted for signature compatibility; this kernel never reads it.
void test_1(int *indices) {
  (void)indices;

// The 16 divides are produced by textual repetition (not a runtime loop) so
// the emitted instruction sequence is the same as writing them out by hand.
// main() normalises its counters by `unroll` (16), so the repetition factor
// here must stay at 16.
#define DIV_STEP(op, acc, den) acc = op(acc, den);
#define DIV_STEP_X4(op, acc, den)                                              \
  DIV_STEP(op, acc, den)                                                       \
  DIV_STEP(op, acc, den)                                                       \
  DIV_STEP(op, acc, den)                                                       \
  DIV_STEP(op, acc, den)
#define DIV_STEP_X16(op, acc, den)                                             \
  DIV_STEP_X4(op, acc, den)                                                    \
  DIV_STEP_X4(op, acc, den)                                                    \
  DIV_STEP_X4(op, acc, den)                                                    \
  DIV_STEP_X4(op, acc, den)

#ifdef AVX2
  {
    __m256d quotient = _mm256_set1_pd(3.33);
    __m256d divisor = _mm256_set1_pd(1.0001);
    for (int iter = 0; iter < repeat; ++iter) {
      DIV_STEP_X16(_mm256_div_pd, quotient, divisor)
    }
    // Fold the result into a global so the divides cannot be discarded.
    res += quotient[0];
  }
#endif
#ifdef AVX512
  {
    __m512d quotient = _mm512_set1_pd(3.33);
    __m512d divisor = _mm512_set1_pd(1.0001);
    for (int iter = 0; iter < repeat; ++iter) {
      DIV_STEP_X16(_mm512_div_pd, quotient, divisor)
    }
    // Fold the result into a global so the divides cannot be discarded.
    res += quotient[0];
  }
#endif

#undef DIV_STEP_X16
#undef DIV_STEP_X4
#undef DIV_STEP
}
|
||
int main(int argc, char *argv[]) { | ||
|
||
int opt; | ||
while ((opt = getopt(argc, argv, "")) != -1) { | ||
switch (opt) { | ||
default: | ||
fprintf(stderr, "Usage: %s [-p]\n", argv[0]); | ||
exit(EXIT_FAILURE); | ||
} | ||
} | ||
|
||
bind_to_core(); | ||
setup_perf_instructions(); | ||
setup_perf_cycles(); | ||
|
||
// int indices[] = {0, 1, 2, 3, 4, 5, 6, 7}; | ||
#ifdef AVX2 | ||
const int vlen = 8; | ||
#endif | ||
#ifdef AVX512 | ||
const int vlen = 16; | ||
#endif | ||
int indices[vlen]; | ||
srand(time(NULL)); | ||
for (int i = 0; i < vlen; i++) { | ||
indices[i] = rand() % 32; | ||
} | ||
|
||
printf("Numbers:"); | ||
for (int i = 0; i < vlen; i++) { | ||
// generate patterns | ||
printf(" %d", indices[i]); | ||
array[indices[i]] = indices[i]; | ||
} | ||
printf("\n"); | ||
|
||
int warmup = 1000; | ||
|
||
for (int i = 0; i < warmup; i++) { | ||
test_1(indices); | ||
} | ||
|
||
int m = 50000; | ||
uint64_t cycles_before = perf_read_cycles(); | ||
uint64_t instructions_before = perf_read_instructions(); | ||
|
||
for (int i = 0; i < m; i++) { | ||
test_1(indices); | ||
} | ||
|
||
uint64_t cycles_after = perf_read_cycles(); | ||
uint64_t instructions_after = perf_read_instructions(); | ||
|
||
// i9-14900K: AVX2 24 cycles | ||
// i9-12900KS: AVX2 24 cycles | ||
// i9-10980XE: AVX2 38 cycles, AVX512 43 cycles | ||
// EPYC 9654: AVX2 20 cycles, AVX512 33 cycles | ||
// EPYC 7742: AVX2 21 cycles | ||
// EPYC 7551: AVX2 20 cycles | ||
printf("%ld cycles, %ld instructions, %.2lf ipc, %d ans\n", | ||
(cycles_after - cycles_before) / m / repeat / unroll, | ||
(instructions_after - instructions_before) / m / repeat / unroll, | ||
(double)(instructions_after - instructions_before) / | ||
(cycles_after - cycles_before), | ||
res); | ||
return 0; | ||
} |
Uh oh!
There was an error while loading. Please reload this page.