diff --git a/src/cl/radix.cl b/src/cl/radix.cl
index 0ffdd02f..caed426d 100644
--- a/src/cl/radix.cl
+++ b/src/cl/radix.cl
@@ -1 +1,102 @@
-// TODO
\ No newline at end of file
+
+#define WORK_GROUP_SIZE 4
+#define TILE_SIZE 16
+
+__kernel void fill_with_zeros(__global unsigned int *as, unsigned int n)
+{
+    const unsigned int gid = get_global_id(0);
+    if (gid >= n) {
+        return;
+    }
+    as[gid] = 0;
+}
+
+// Per-work-group histogram: counters is laid out as one row of (1 << bits_count) digit counts per work-group.
+__kernel void count(__global unsigned int *as, __global unsigned int *counters, unsigned int n, unsigned int shift, unsigned int bits_count) {
+    unsigned int gid = get_global_id(0);
+    if (gid >= n) {
+        return;
+    }
+    unsigned int value = (as[gid] >> shift) & ((1 << bits_count) - 1);
+    unsigned int wgid = get_group_id(0);
+    atomic_inc(&counters[wgid * (1 << bits_count) + value]);
+}
+
+// Transposes an m x k matrix a into a k x m matrix at via local memory (padded column to avoid bank conflicts).
+__kernel void matrix_transpose_local_good_banks(
+        __global unsigned int *a,
+        __global unsigned int *at,
+        unsigned int m,
+        unsigned int k
+) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+
+    __local unsigned int tile[TILE_SIZE][TILE_SIZE + 1];
+    int local_i = get_local_id(0);
+    int local_j = get_local_id(1);
+
+    if (i < k && j < m) {
+        tile[local_j][local_i] = a[j * k + i];
+    } else {
+        tile[local_j][local_i] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int target_j = (i - local_i) + local_j;
+    int target_i = (j - local_j) + local_i;
+    if (target_j < k && target_i < m) {
+        at[target_j * m + target_i] = tile[local_i][local_j];
+    }
+}
+
+// One Hillis-Steele step of an inclusive prefix sum: adds the element i positions to the left.
+__kernel void prefix_sum(__global unsigned int *as, __global unsigned int *bs, unsigned int i, unsigned int n)
+{
+    unsigned int gid = get_global_id(0);
+    if (gid >= n) {
+        return;
+    }
+    if (gid >= i) {
+        bs[gid] = as[gid - i] + as[gid];
+    } else {
+        bs[gid] = as[gid];
+    }
+}
+
+__kernel void radix_sort(__global unsigned int *as, __global unsigned int *bs, __global unsigned int *counters, unsigned int n, unsigned int shift, unsigned int bits_count)
+{
+    unsigned int gid = get_global_id(0);
+    if (gid >= n) {
+        return;
+    }
+
+    unsigned int value = (as[gid] >> shift) & ((1 << bits_count) - 1);
+
+    unsigned int wgid = get_group_id(0);
+
+    unsigned int start = wgid * WORK_GROUP_SIZE;
+    unsigned int end = gid;
+    unsigned int offset = 0;
+
+    // Rank of this element among the elements of its work-group that carry the same digit.
+    for (unsigned int i = start; i < end; ++i) {
+        unsigned int prev_value = (as[i] >> shift) & ((1 << bits_count) - 1);
+        if (prev_value == value) {
+            ++offset;
+        }
+    }
+
+    // counters holds an inclusive prefix sum over the transposed histogram,
+    // so the slot for (digit, work-group) is value * numGroups + wgid.
+    unsigned int base_idx;
+    unsigned int counters_idx = value * get_num_groups(0) + wgid;
+    if (counters_idx > 0) {
+        base_idx = counters[counters_idx - 1];
+    } else {
+        base_idx = 0;
+    }
+
+    bs[base_idx + offset] = as[gid];
+}
diff --git a/src/main_radix.cpp b/src/main_radix.cpp
index 20af3316..895918f4 100644
--- a/src/main_radix.cpp
+++ b/src/main_radix.cpp
@@ -11,9 +11,13 @@
 #include
 #include
 
+#define TILE_SIZE 16
+
 const int benchmarkingIters = 10;
 const int benchmarkingItersCPU = 1;
-const unsigned int n = 32 * 1024 * 1024;
+const unsigned int n = 16;
+
+const int bits_count = 2;
 
 template<typename T>
 void raiseFail(const T &a, const T &b, std::string message, std::string filename, int line) {
@@ -57,21 +61,68 @@ int main(int argc, char **argv) {
     std::cout << "Data generated for n=" << n << "!" << std::endl;
     const std::vector<unsigned int> cpu_reference = computeCPU(as);
-
-    // remove me
-    return 0;
-
+
+    unsigned int workGroupSize = 4;
+    unsigned int workGroupsCount = (n + workGroupSize - 1) / workGroupSize;
+    unsigned int globalWorkSize = workGroupsCount * workGroupSize;
+    unsigned int countersSize = workGroupsCount * (1 << bits_count);
+    unsigned int countersWorkSize = (countersSize + workGroupSize - 1) / workGroupSize * workGroupSize;
+
+    gpu::gpu_mem_32u as_gpu;
+    gpu::gpu_mem_32u bs_gpu;
+    gpu::gpu_mem_32u counters_gpu;
+    gpu::gpu_mem_32u counters_gpu_tmp;
+
+    as_gpu.resizeN(n);
+    bs_gpu.resizeN(n);
+    counters_gpu.resizeN(countersSize);
+    counters_gpu_tmp.resizeN(countersSize);
+
     {
+        ocl::Kernel fill_with_zeros(radix_kernel, radix_kernel_length, "fill_with_zeros");
+        fill_with_zeros.compile();
+        ocl::Kernel count(radix_kernel, radix_kernel_length, "count");
+        count.compile();
+        ocl::Kernel transpose(radix_kernel, radix_kernel_length, "matrix_transpose_local_good_banks");
+        transpose.compile();
+        ocl::Kernel prefix_sum(radix_kernel, radix_kernel_length, "prefix_sum");
+        prefix_sum.compile();
+        ocl::Kernel radix_sort(radix_kernel, radix_kernel_length, "radix_sort");
+        radix_sort.compile();
+
+        std::vector<unsigned int> counters(countersSize, 0);
+
         timer t;
         for (int iter = 0; iter < benchmarkingIters; ++iter) {
-            // Start the timer only after the data has been uploaded, so that we measure kernel time rather than data transfer
+            as_gpu.writeN(as.data(), n);
             t.restart();
+
+            // One pass per bits_count-bit digit: per-group histogram, transpose, global prefix sum, stable scatter.
+            for (unsigned int shift = 0; shift < 32; shift += bits_count) {
+                fill_with_zeros.exec(gpu::WorkSize(workGroupSize, countersWorkSize), counters_gpu, countersSize);
+
+                count.exec(gpu::WorkSize(workGroupSize, globalWorkSize), as_gpu, counters_gpu, n, shift, bits_count);
+
+                unsigned int x_size = ((1 << bits_count) + TILE_SIZE - 1) / TILE_SIZE * TILE_SIZE;
+                unsigned int y_size = (workGroupsCount + TILE_SIZE - 1) / TILE_SIZE * TILE_SIZE;
+                transpose.exec(gpu::WorkSize(TILE_SIZE, TILE_SIZE, x_size, y_size), counters_gpu, counters_gpu_tmp, workGroupsCount, 1 << bits_count);
+                std::swap(counters_gpu, counters_gpu_tmp);
+
+                for (unsigned int i = 1; i < countersSize; i *= 2) {
+                    prefix_sum.exec(gpu::WorkSize(workGroupSize, countersWorkSize), counters_gpu, counters_gpu_tmp, i, countersSize);
+                    std::swap(counters_gpu, counters_gpu_tmp);
+                }
 
-            // TODO
+                radix_sort.exec(gpu::WorkSize(workGroupSize, globalWorkSize), as_gpu, bs_gpu, counters_gpu, n, shift, bits_count);
+                std::swap(as_gpu, bs_gpu);
+            }
+            t.nextLap();
         }
         t.stop();
 
         std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
         std::cout << "GPU: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;
+
+        as_gpu.readN(as.data(), n);
     }
 
     // Check that the results are correct