jiegec · yegroup001 · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/include/counters_mapping.h b/include/counters_mapping.h
@@ -67,6 +67,10 @@ DEFINE_COUNTER_RANGE(cycles, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)
 DEFINE_COUNTER_RANGE(instructions, all, PERF_TYPE_HARDWARE,
                      PERF_COUNT_HW_INSTRUCTIONS)
 
+// cache misses and loads
+DEFINE_COUNTER_RANGE(llc_misses, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES)
+DEFINE_COUNTER_RANGE(llc_loads, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES)
+
 // branch mispredictions
 DEFINE_COUNTER_RANGE(branch_misses, all, PERF_TYPE_HARDWARE,
                      PERF_COUNT_HW_BRANCH_MISSES)

diff --git a/include/uarch.h b/include/uarch.h
@@ -14,6 +14,9 @@ enum uarch {
   // qualcomm
   oryon,
   // arm
+  cortex_a53,
+  cortex_a55,
+  cortex_a73,
   cortex_a77,
   cortex_a78,
   cortex_x1,
@@ -23,7 +26,7 @@ enum uarch {
   neoverse_v2,
   // hisilicon
   tsv110,
-
+  tsv200m,
   unknown_arm64,
   arm64_begin = firestorm,
   arm64_end = unknown_arm64,
@@ -41,6 +44,7 @@ enum uarch {
   sunny_cove,
   skylake,
   broadwell,
+  whiskylake,
   // amd
   zen1,
   zen2,

diff --git a/include/utils.h b/include/utils.h
@@ -129,5 +129,8 @@ int virt_to_phys_user(uintptr_t *paddr, uintptr_t vaddr);
 #ifndef PHR_BRANCHES
 #define PHR_BRANCHES 64
 #endif
+#ifndef PHRB_BRANCHES
+#define PHRB_BRANCHES 32
+#endif
 
 #endif
diff --git a/meson.build b/meson.build
@@ -4,6 +4,10 @@ cpp_args = []
 link_args = []
 cpu = target_machine.cpu_family()
 
+avx2_support = false
+avx512f_support = false
+sve_support = false
+
 if get_option('ios')
 	message('Configured for iOS')
 	cpp_args += ['-DIOS', '-std=c++11', '-DHOST_AARCH64', '-march=armv8.4-a']
@@ -20,12 +24,24 @@ else
 	foreach line : r.stderr().strip().split('\n')
 		message(line)
 	endforeach
-	foreach line : r.stdout().strip().split('\n')
-		cpp_args += [line]
+	foreach line : r.stdout().strip().split('\n') # each line is a compiler flag or a feature marker
+		if line.startswith('-') # compiler flag; unlike line[0], startswith() is safe on an empty line
+			cpp_args += [line]
+		elif line == 'AVX2 detected'
+			avx2_support = true
+		elif line == 'AVX512F detected'
+			avx512f_support = true
+		elif line == 'SVE detected'
+			sve_support = true
+		endif
 	endforeach
 	message('Got CXXFLAGS:', cpp_args)
 endif
 
+if cpu == 'x86_64'
+	message('Got CXXFLAGS:', cpp_args)
+endif
+
 
 message('Final CXXFLAGS:', cpp_args)
 message('Final LDFLAGS:', link_args)
@@ -49,18 +65,81 @@ endforeach
 libs = []
 
 if cpu == 'x86_64'
-	gather_avx2 = executable('gather_avx2',
-		'src/gather.cpp',
-		cpp_args: ['-DAVX2', '-mavx2'],
-		link_with: utils,
-		install: true)
+	if avx2_support
+		gather_avx2 = executable('gather_avx2',
+			'src/gather.cpp',
+			cpp_args: ['-DAVX2', '-mavx2'],
+			link_with: utils,
+			install: true)
+		div_avx2 = executable('div_avx2',
+			'src/div.cpp',
+			cpp_args: ['-DAVX2', '-mavx2'],
+			link_with: utils,
+			install: true)
+	endif
+	if avx512f_support
 	gather_avx512 = executable('gather_avx512',
 		'src/gather.cpp',
 		cpp_args: ['-DAVX512', '-mavx512f'],
 		link_with: utils,
 		install: true)
+	endif
+elif cpu == 'aarch64'
+	# gather_neon = executable('gather_neon',
+	# 	'src/gather_aarch64.cpp',
+	# 	cpp_args: ['-DNEON'],
+	# 	link_with: utils,
+	# 	install: true)
+	if sve_support
+		gather_sve = executable('gather_sve',
+			'src/gather_aarch64.cpp',
+			cpp_args: ['-DSVE', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+		sve_fp32_add = executable('sve_fp32_add',
+			'src/simd_aarch64.cpp',
+			cpp_args: ['-DSVE_FP32_ADD', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+		sve_fp64_add = executable('sve_fp64_add',
+			'src/simd_aarch64.cpp',
+			cpp_args: ['-DSVE_FP64_ADD', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+		sve_fp32_fma = executable('sve_fp32_fma',
+			'src/simd_aarch64.cpp',
+			cpp_args: ['-DSVE_FP32_FMA', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+		sve_fp64_fma = executable('sve_fp64_fma',
+			'src/simd_aarch64.cpp',
+			cpp_args: ['-DSVE_FP64_FMA', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+	endif
+	neon_fp32_add = executable('neon_fp32_add',
+		'src/simd_aarch64.cpp',
+		cpp_args: ['-DNEON_FP32_ADD'],
+		link_with: utils,
+		install: true)
+	neon_fp64_add = executable('neon_fp64_add',
+		'src/simd_aarch64.cpp',
+		cpp_args: ['-DNEON_FP64_ADD'],
+		link_with: utils,
+		install: true)
+	neon_fp32_fma = executable('neon_fp32_fma',
+		'src/simd_aarch64.cpp',
+		cpp_args: ['-DNEON_FP32_FMA'],
+		link_with: utils,
+		install: true)
+	neon_fp64_fma = executable('neon_fp64_fma',
+		'src/simd_aarch64.cpp',
+		cpp_args: ['-DNEON_FP64_FMA'],
+		link_with: utils,
+		install: true)
 endif
 
+
 cpp = meson.get_compiler('cpp')
 cpu = target_machine.cpu_family()
 

diff --git a/src/detect_uarch.cpp b/src/detect_uarch.cpp
@@ -30,6 +30,15 @@ int main() {
   case cortex_a77:
     printf("-DARM_CORTEX_A77\n");
     break;
+  case cortex_a53:
+    printf("-DARM_CORTEX_A53\n");
+    break;
+  case cortex_a55:
+    printf("-DARM_CORTEX_A55\n");
+    break;
+  case cortex_a73:
+    printf("-DARM_CORTEX_A73\n");
+    break;
   case cortex_x1:
     printf("-DARM_CORTEX_X1\n");
     break;
@@ -70,6 +79,10 @@ int main() {
     printf("-DINTEL\n");
     printf("-DINTEL_BROADWELL\n");
     break;
+  case whiskylake:
+    printf("-DINTEL\n");
+    printf("-DINTEL_WHISKYLAKE\n");
+    break;
   case zen1:
     printf("-DAMD\n");
     printf("-DAMD_ZEN1\n");

diff --git a/src/div.cpp b/src/div.cpp
@@ -0,0 +1,148 @@
+// Latency microbenchmark for packed double-precision vector division.
+// Build with exactly one of -DAVX2 (_mm256_div_pd) or -DAVX512 (_mm512_div_pd).
+#include "include/utils.h"
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+// Both variants declare the same locals, so exactly one must be selected.
+#if defined(AVX2) == defined(AVX512)
+#error "div.cpp: define exactly one of AVX2 or AVX512"
+#endif
+
+// Accumulates one lane of every test_1() result; printed by main().
+int res = 0;
+const int n = 1000;
+int array[n] = {0};
+// Loop iterations per test_1() call.
+const int repeat = 500;
+// Divides per loop iteration; must match the unrolled bodies in test_1().
+const int unroll = 16;
+
+// Runs repeat * unroll divides, each consuming the previous quotient, then
+// adds lane 0 of the final vector to res. indices is accepted but not read.
+void test_1(int *indices) {
+  (void)indices;
+#ifdef AVX2
+  __m256d index = _mm256_set1_pd(3.33);
+  __m256d d0 = _mm256_set1_pd(1.0001);
+  for (int i = 0; i < repeat; i++) {
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+  }
+  res += index[0];
+#endif
+#ifdef AVX512
+  __m512d index = _mm512_set1_pd(3.33);
+  __m512d d0 = _mm512_set1_pd(1.0001);
+  for (int i = 0; i < repeat; i++) {
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+  }
+  res += index[0];
+#endif
+}
+
+// Calls bind_to_core() and the perf counter setup helpers, runs test_1() for
+// a warm-up phase and a measured phase, and prints cycles and instructions
+// per divide, the overall IPC, and the accumulated res.
+int main(int argc, char *argv[]) {
+  // The optstring is empty, so any option is rejected with the usage message.
+  int opt;
+  while ((opt = getopt(argc, argv, "")) != -1) {
+    switch (opt) {
+    default:
+      fprintf(stderr, "Usage: %s\n", argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  bind_to_core();
+  setup_perf_instructions();
+  setup_perf_cycles();
+
+  // Number of random values generated and printed below.
+#ifdef AVX2
+  const int vlen = 8;
+#endif
+#ifdef AVX512
+  const int vlen = 16;
+#endif
+  int indices[vlen];
+  srand(time(NULL));
+  for (int i = 0; i < vlen; i++) {
+    indices[i] = rand() % 32;
+  }
+
+  printf("Numbers:");
+  for (int i = 0; i < vlen; i++) {
+    // generate patterns
+    printf(" %d", indices[i]);
+    array[indices[i]] = indices[i];
+  }
+  printf("\n");
+
+  int warmup = 1000;
+
+  for (int i = 0; i < warmup; i++) {
+    test_1(indices);
+  }
+
+  int m = 50000;
+  uint64_t cycles_before = perf_read_cycles();
+  uint64_t instructions_before = perf_read_instructions();
+
+  for (int i = 0; i < m; i++) {
+    test_1(indices);
+  }
+
+  uint64_t cycles_after = perf_read_cycles();
+  uint64_t instructions_after = perf_read_instructions();
+
+  // i9-14900K: AVX2 24 cycles
+  // i9-12900KS: AVX2 24 cycles
+  // i9-10980XE: AVX2 38 cycles, AVX512 43 cycles
+  // EPYC 9654: AVX2 20 cycles, AVX512 33 cycles
+  // EPYC 7742: AVX2 21 cycles
+  // EPYC 7551: AVX2 20 cycles
+  // Counters are uint64_t, so print them with PRIu64 rather than %ld.
+  printf("%" PRIu64 " cycles, %" PRIu64 " instructions, %.2lf ipc, %d ans\n",
+         (cycles_after - cycles_before) / m / repeat / unroll,
+         (instructions_after - instructions_before) / m / repeat / unroll,
+         (double)(instructions_after - instructions_before) /
+             (cycles_after - cycles_before),
+         res);
+  return 0;
+}