This repository has been archived by the owner on Jan 13, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.c
204 lines (162 loc) · 6.56 KB
/
main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
// Adapted from
// https://github.com/itzmeanjan/vectorized-rescue-prime/blob/614500d/main.c
#include "bench.h"
#include "test.h"
// Prints `msg` and makes the *enclosing function* return EXIT_FAILURE when
// `status` is anything other than CL_SUCCESS; does nothing otherwise.
//
// - expansion is wrapped in do { } while (0), so `show_message_and_exit(...);`
//   is exactly one statement and can sit safely in an unbraced if / else
// - `msg` is printed through a "%s" format, so a `%` appearing in the message
//   is never interpreted as a conversion specifier
// - `status` is parenthesized, so any expression can be passed
#define show_message_and_exit(status, msg)                                     \
  do {                                                                         \
    if ((status) != CL_SUCCESS) {                                              \
      printf("%s", (msg));                                                     \
      return EXIT_FAILURE;                                                     \
    }                                                                          \
  } while (0)
// Runs `bench_merklize` up to `itr_cnt` times with the same input
// configuration and stores, in `ts[0..2]`, the per-run average of the three
// timings that `bench_merklize` reports.
//
// Expects `status`, `ctx`, `c_queue`, `krnl_2`, `leaf_count` & `wg_size` to be
// in scope at the expansion site; `ts` must point to three accumulators which
// caller has already zeroed.
//
// - per-run timings live in a small stack array, so nothing is heap allocated
//   ( and hence nothing can leak ) inside the loop
// - as soon as a run reports a status other than CL_SUCCESS, looping stops,
//   that run is not accumulated and `status` is left holding the failure for
//   caller to inspect
// - `itr_cnt` is evaluated exactly once; when it is 0, `ts` is left untouched
//   instead of being divided by zero
// - expansion is a single statement ( do { } while (0) )
#define avg_bench_time(itr_cnt, ts)                                            \
  do {                                                                         \
    const size_t avg_n_ = (itr_cnt);                                           \
    for (size_t avg_i_ = 0; avg_i_ < avg_n_; avg_i_++) {                       \
      cl_ulong avg_ts_[3] = { 0, 0, 0 };                                       \
      status =                                                                 \
        bench_merklize(ctx, c_queue, krnl_2, leaf_count, wg_size, avg_ts_);    \
      if (status != CL_SUCCESS) {                                              \
        break;                                                                 \
      }                                                                        \
      (ts)[0] += avg_ts_[0];                                                   \
      (ts)[1] += avg_ts_[1];                                                   \
      (ts)[2] += avg_ts_[2];                                                   \
    }                                                                          \
    if (avg_n_ != 0) {                                                         \
      (ts)[0] /= avg_n_;                                                       \
      (ts)[1] /= avg_n_;                                                       \
      (ts)[2] /= avg_n_;                                                       \
    }                                                                          \
  } while (0)
// Two step stringification:
//
// STR_(token) turns its argument into a string literal as written, while
// STR(token) lets the preprocessor expand the argument first and only then
// stringifies the result ( needed for macros such as SPIRV_IR_0 )
#define STR_(token) #token
#define STR(token) STR_(token)
// Selects how OpenCL programs are obtained
//
// - PROGRAM_FROM_IL defined: programs are built from SPIR-V files, whose names
//   default to kernel_{0,1,2}.spv unless SPIRV_IR_{0,1,2} were already defined
// - otherwise: PROGRAM_FROM_SOURCE gets defined
#if defined(PROGRAM_FROM_IL)

#if !defined(SPIRV_IR_0)
#define SPIRV_IR_0 kernel_0.spv
#endif

#if !defined(SPIRV_IR_1)
#define SPIRV_IR_1 kernel_1.spv
#endif

#if !defined(SPIRV_IR_2)
#define SPIRV_IR_2 kernel_2.spv
#endif

#else
#define PROGRAM_FROM_SOURCE
#endif
// Entry point
//
// Finds an OpenCL device, creates a context & a profiling enabled command
// queue, builds three programs, runs the two `hash` kernel tests and finally
// benchmarks the `merklize` kernel for 2^20 .. 2^25 leaves, printing averaged
// kernel execution & data transfer times.
//
// Returns EXIT_SUCCESS when every step succeeds; returns EXIT_FAILURE ( after
// printing a diagnostic ) as soon as an OpenCL call, a program build, a test
// or a benchmark run reports a status other than CL_SUCCESS.
int
main(int argc, char** argv)
{
  // command line arguments are not used
  (void)argc;
  (void)argv;

  cl_int status;

  cl_device_id dev_id;
  status = find_device(&dev_id);
  show_message_and_exit(status, "failed to find device !\n");

  // device name is queried in two steps: first its size in bytes, then, once
  // enough memory is available, the name itself
  size_t val_size;
  status = clGetDeviceInfo(dev_id, CL_DEVICE_NAME, 0, NULL, &val_size);
  show_message_and_exit(status, "failed to get device name !\n");

  void* dev_name = malloc(val_size);
  check_mem_alloc(dev_name);

  status = clGetDeviceInfo(dev_id, CL_DEVICE_NAME, val_size, dev_name, NULL);
  show_message_and_exit(status, "failed to get device name !\n");

  printf("running on %s\n", (char*)dev_name);

  cl_context ctx = clCreateContext(NULL, 1, &dev_id, NULL, NULL, &status);
  show_message_and_exit(status, "failed to create context !\n");

  // enable profiling in queue, to get (precise) kernel execution time with
  // nanosecond level granularity
  //
  // out of order execution is beneficial, because hierarchical structure of
  // merkle tree requires multiple command submissions, which are not always
  // dependent on all previous commands enqueued till now
  //
  // in certain cases, runtime will benefit by better inferring compute
  // dependency graph from event dependencies specified when enqueuing commands
  cl_queue_properties props[] = {
    CL_QUEUE_PROPERTIES,
    CL_QUEUE_PROFILING_ENABLE |
      CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, // because it's a bit field
    0
  };

  cl_command_queue c_queue =
    clCreateCommandQueueWithProperties(ctx, dev_id, props, &status);
  show_message_and_exit(status, "failed to create command queue !\n");

  // Note following three programs, use different compilation flags
  // resulting into different kernels in preprocessed source code
  //
  // program handles live on the stack ( a handle is just a pointer ), so there
  // is no allocation which may fail; they start as NULL so that the build log
  // helper never receives an indeterminate handle when a build fails early
  cl_program prgm_0 = NULL;
#ifdef PROGRAM_FROM_IL
  status = build_kernel_from_il(ctx, dev_id, STR(SPIRV_IR_0), NULL, &prgm_0);
#else
  status = build_kernel_from_source(
    ctx, dev_id, "kernel.cl", ocl_kernel_flag_0, &prgm_0);
#endif
  if (status != CL_SUCCESS) {
    printf("failed to compile kernel !\n");
    show_build_log(dev_id, prgm_0);
    return EXIT_FAILURE;
  }

  cl_program prgm_1 = NULL;
#ifdef PROGRAM_FROM_IL
  status = build_kernel_from_il(ctx, dev_id, STR(SPIRV_IR_1), NULL, &prgm_1);
#else
  status = build_kernel_from_source(
    ctx, dev_id, "kernel.cl", ocl_kernel_flag_1, &prgm_1);
#endif
  if (status != CL_SUCCESS) {
    printf("failed to compile kernel !\n");
    show_build_log(dev_id, prgm_1);
    return EXIT_FAILURE;
  }

  cl_program prgm_2 = NULL;
#ifdef PROGRAM_FROM_IL
  status = build_kernel_from_il(ctx, dev_id, STR(SPIRV_IR_2), NULL, &prgm_2);
#else
  status = build_kernel_from_source(
    ctx, dev_id, "kernel.cl", ocl_kernel_flag_2, &prgm_2);
#endif
  if (status != CL_SUCCESS) {
    printf("failed to compile kernel !\n");
    show_build_log(dev_id, prgm_2);
    return EXIT_FAILURE;
  }

  cl_kernel krnl_0 = clCreateKernel(prgm_0, "hash", &status);
  show_message_and_exit(status, "failed to create `hash` kernel !\n");

  cl_kernel krnl_1 = clCreateKernel(prgm_1, "hash", &status);
  show_message_and_exit(status, "failed to create `hash` kernel !\n");

  // kernel to be used for benchmarking
  cl_kernel krnl_2 = clCreateKernel(prgm_2, "merklize", &status);
  show_message_and_exit(status, "failed to create `merklize` kernel !\n");

  // status of each test is inspected on its own, so result of first test is
  // not overwritten by second one and success is reported only when both of
  // them returned CL_SUCCESS
  status = test_hash_0(ctx, c_queue, krnl_0);
  show_message_and_exit(status, "failed to run blake3 hash test 0 !\n");

  status = test_hash_1(ctx, c_queue, krnl_1);
  show_message_and_exit(status, "failed to run blake3 hash test 1 !\n");

  printf("\npassed blake3 hash test !\n");
  printf("\nBenchmarking Binary Merklization using BLAKE3\n\n");

  // NOTE(review): whatever this helper returns is not inspected here, its
  // signature is not visible from this file; confirm whether it can fail
  size_t wg_size = 0;
  preferred_work_group_size_multiple(krnl_2, dev_id, &wg_size);

  // each input size is benchmarked 8 times and timings are averaged
  const size_t itr_cnt = 1 << 3;

  for (size_t i = 20; i <= 25; i++) {
    // shift is performed on a size_t operand, not on an int
    const size_t leaf_count = (size_t)1 << i;

    // following three kinds of time ( in nano second level granularity ) are
    // accumulated here after completion of merklization
    //
    // 0. kernel execution time
    // 1. host to device data tx time
    // 2. device to host data tx time
    cl_ulong ts[3] = { 0, 0, 0 };

    avg_bench_time(itr_cnt, ts);
    show_message_and_exit(status, "failed to benchmark merklization !\n");

    printf(
      "merklized 2 ^ %2zu leaves in %16.4lf ms\t\twith host to device data tx "
      "in %16.4lf ms\t\twhile device to host data tx took %16.4lf ms\n",
      i,
      (double)ts[0] * 1e-6,
      (double)ts[1] * 1e-6,
      (double)ts[2] * 1e-6);
  }

  // release all opencl resources acquired
  clReleaseKernel(krnl_0);
  clReleaseKernel(krnl_1);
  clReleaseKernel(krnl_2);

  clReleaseProgram(prgm_0);
  clReleaseProgram(prgm_1);
  clReleaseProgram(prgm_2);

  clReleaseCommandQueue(c_queue);
  clReleaseContext(ctx);
  clReleaseDevice(dev_id);

  // release host memory
  free(dev_name);

  return EXIT_SUCCESS;
}