Skip to content

Commit

Permalink
Added work-efficient prefix sum
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaBolkisev committed Jan 14, 2025
1 parent 286de39 commit 6080918
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
22 changes: 21 additions & 1 deletion src/cl/prefix_sum.cl
Original file line number Diff line number Diff line change
@@ -1 +1,21 @@
// TODO
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>
#endif

#line 6

/**
 * One accumulation pass of the in-place prefix sum.
 *
 * Work-item g targets the last element of the g-th consecutive group of
 * `rate` elements, i.e. index (g + 1) * rate - 1, and adds to it the element
 * located rate / 2 positions earlier. Work-items whose target index is not
 * below n do nothing.
 *
 * @param sum   buffer that is updated in place
 * @param n     number of elements in `sum`
 * @param rate  group size handled by this pass (the host is expected to pass
 *              a value >= 2 -- TODO confirm against the caller)
 */
__kernel void prefix_sum_binary(__global unsigned int* sum, unsigned int n, unsigned int rate)
{
    // The index is kept in 64 bits: the launch may be padded with extra
    // work-items, so (g + 1) * rate can exceed the 32-bit range. Narrowing it
    // to int could wrap the value back below n and let a padding work-item
    // pass the bounds check and corrupt the buffer.
    const ulong group = (ulong) get_global_id(0) + 1;
    const ulong i = group * rate - 1;
    const ulong half_rate = rate >> 1;

    if (i < n) {
        sum[i] += sum[i - half_rate];
    }
}

/**
 * One propagation pass of the in-place prefix sum.
 *
 * Work-item g targets the element in the middle of the group that follows the
 * g-th group boundary, i.e. index (g + 1) * rate - 1 + rate / 2, and adds to
 * it the element located rate / 2 positions earlier (index (g + 1) * rate - 1).
 * Work-items whose target index is not below n do nothing.
 *
 * @param sum   buffer that is updated in place
 * @param n     number of elements in `sum`
 * @param rate  group size handled by this pass (the host is expected to pass
 *              a value >= 2 -- TODO confirm against the caller)
 */
__kernel void prefix_sum_second_part(__global unsigned int* sum, unsigned int n, unsigned int rate)
{
    // The index is kept in 64 bits: the launch may be padded with extra
    // work-items, so (g + 1) * rate can exceed the 32-bit range. Narrowing it
    // to int could wrap the value back below n and let a padding work-item
    // pass the bounds check and corrupt the buffer.
    const ulong half_rate = rate >> 1;
    const ulong group = (ulong) get_global_id(0) + 1;
    const ulong i = group * rate - 1 + half_rate;

    if (i < n) {
        sum[i] += sum[i - half_rate];
    }
}
29 changes: 26 additions & 3 deletions src/main_prefix_sum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,41 @@ int main(int argc, char **argv)
#endif

// work-efficient prefix sum
#if 0
#if 1
{
gpu::Device device = gpu::chooseGPUDevice(argc, argv);
gpu::Context context;
context.init(device.device_id_opencl);
context.activate();

std::vector<unsigned int> res(n);

ocl::Kernel prefix_sum_binary(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum_binary");
ocl::Kernel prefix_sum_second_part(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum_second_part");
prefix_sum_binary.compile();
prefix_sum_second_part.compile();

gpu::gpu_mem_32u gpu;
gpu.resizeN(n);

timer t;
for (int iter = 0; iter < benchmarkingIters; ++iter) {
// TODO
gpu.writeN(as.data(), as.size());
t.restart();
// TODO

for (unsigned int rate = 2; rate <= n; rate *= 2)
prefix_sum_binary.exec(
gpu::WorkSize(64, n / rate), gpu, n, rate);

for (unsigned int rate = n / 2; rate >= 2; rate /= 2)
prefix_sum_second_part.exec(gpu::WorkSize(64, (n + rate - 1) / rate),
gpu, n, rate);

t.nextLap();
}

gpu.readN(res.data(), as.size());

std::cout << "GPU [work-efficient]: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
std::cout << "GPU [work-efficient]: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;

Expand Down

0 comments on commit 6080918

Please sign in to comment.