Skip to content

Commit c58984a

Browse files
authored
Update prime_finder.c
Add faster initialization with SIMD instructions
1 parent bacc915 commit c58984a

File tree

1 file changed

+40
-16
lines changed

1 file changed

+40
-16
lines changed

prime_finder.c

+40-16
Original file line numberDiff line numberDiff line change
@@ -4,53 +4,75 @@
44
#include <time.h>
55
#include <stdint.h>
66

7-
uint64_t run(uint64_t max) {
7+
#include <immintrin.h>
8+
9+
uint64_t compute(uint64_t max) {
810
if (max < 5) {
911
printf("Max value must be at least 5.\n");
1012
return -1;
1113
}
1214
uint64_t byteSize = ((max - 5) >> 4) + 1;
13-
byteSize += 3 - (byteSize % 3); // Bit overflow for optimization
15+
byteSize += 32 - (byteSize % 32); // Make it a multiple of 32
1416
printf("Attempting to allocate %lu bytes.\n", byteSize);
1517

1618
unsigned char* mem = (unsigned char*)malloc(byteSize);
1719
if (!mem) {
1820
printf("Failed to allocate memory.\n");
1921
return -1;
2022
}
21-
for (uint64_t i = 0; i < byteSize; i += 3) {
22-
mem[i] = 0b00100100; // if (i + 1 < byteSize)
23-
mem[i + 1] = 0b01001001;
24-
mem[i + 2] = 0b10010010;
23+
24+
__m256i pattern = _mm256_setr_epi8(
25+
0b00100100, 0b01001001, 0b10010010,
26+
0b00100100, 0b01001001, 0b10010010,
27+
0b00100100, 0b01001001, 0b10010010,
28+
0b00100100, 0b01001001, 0b10010010,
29+
0b00100100, 0b01001001, 0b10010010,
30+
0b00100100, 0b01001001, 0b10010010,
31+
0b00100100, 0b01001001, 0b10010010,
32+
0b00100100, 0b01001001, 0b10010010,
33+
0b00100100, 0b01001001, 0b10010010,
34+
0b00100100, 0b01001001, 0b10010010,
35+
0b00100100, 0b01001001
36+
);
37+
38+
for (uint64_t i = 0; i <= byteSize; i += 33) {
39+
_mm256_storeu_si256((__m256i *)(mem + i), pattern);
40+
mem[32 + i] = 0b10010010;
2541
}
2642

2743
printf("Memory allocated.\n");
2844

29-
uint64_t halfMax = max >> 1;
30-
uint64_t sqrtMax = (uint64_t)sqrt(max) | 1;
45+
const uint64_t halfMax = max >> 1;
46+
const uint64_t sqrtMax = (uint64_t)sqrt(max); // Have to be odd !
3147
uint64_t total = 2;
3248

33-
if (sqrtMax < 5) { sqrtMax = 5; }
49+
if (sqrtMax < 5) { const sqrtMax = 5; }
3450

3551
for (uint64_t n = 5; n <= sqrtMax; n += 2) {
36-
uint64_t idx = (n - 5) >> 1;
37-
unsigned char byte = mem[idx >> 3];
38-
unsigned char bit = 1 << (idx & 7);
52+
const uint64_t idx = (n - 5) >> 1;
53+
const unsigned char byte = mem[idx >> 3];
54+
const unsigned char bit = 1 << (idx & 7);
3955

4056
if (!(byte & bit)) {
4157
total++;
58+
59+
const int startPos = (n * n - 5) >> 1;
60+
61+
4262
for (uint64_t k = (n * n - 5) >> 1; k <= halfMax; k += n) {
4363
mem[k >> 3] |= (1 << (k & 7));
4464
}
4565
}
4666
}
4767

48-
uint64_t cacheidx = (sqrtMax - 5) >> 1;
49-
unsigned char bit = 1 << ((cacheidx & 7) - 1);
68+
printf("First pass completed.\n");
69+
70+
uint64_t cacheidx = (sqrtMax + 1 - 5) >> 1;
71+
unsigned char bit = 1 << (cacheidx & 7);
5072
cacheidx >>= 3;
5173
unsigned char cache = mem[cacheidx];
5274

53-
for (uint64_t n = sqrtMax + 2; n <= max; n += 2) {
75+
for (uint64_t n = sqrtMax + 1; n <= max; n += 2) {
5476
if (!(cache & bit)) {
5577
total++;
5678
}
@@ -62,6 +84,8 @@ uint64_t run(uint64_t max) {
6284
}
6385
}
6486

87+
printf("\nSecond pass completed.\n");
88+
6589
free(mem);
6690
return total;
6791
}
@@ -76,7 +100,7 @@ int main(int argc, char *argv[]) {
76100
}
77101

78102
clock_t start = clock();
79-
uint64_t total = run(max_value);
103+
uint64_t total = compute(max_value);
80104
clock_t end = clock();
81105
double time_spent = (double)(end - start) / CLOCKS_PER_SEC;
82106

0 commit comments

Comments
 (0)