GPGPUCourse · kurmakaevAlsu · Nov 10, 2024 · Jan 8, 2025 · Jan 8, 2025 · Jan 8, 2025
diff --git a/src/cl/radix.cl b/src/cl/radix.cl
@@ -1 +1,96 @@
-// TODO
+
+#define WORK_GROUP_SIZE 4
+#define TILE_SIZE 16
+
+__kernel void fill_with_zeros(__global unsigned int *as, unsigned int n)
+{
+    const unsigned int gid = get_global_id(0);
+    if (gid >= n) {
+        return;
+    }
+    as[gid] = 0;
+}
+
+__kernel void count(__global unsigned int *as, __global unsigned int *counters, unsigned int n, unsigned int shift, unsigned int bits_count) {
+    unsigned int gid = get_global_id(0);
+    if (gid >= n) {
+        return;
+    }
+    unsigned int value = (as[gid] >> shift) & ((1 << bits_count) - 1); 
+    unsigned int wgid = get_group_id(0);
+    atomic_inc(&counters[wgid * (1 << bits_count) + value]);
+}
+
+__kernel void matrix_transpose_local_good_banks(
+    __global float *a,
+    __global float *at,
+    unsigned int m,
+    unsigned int k
+) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+
+    __local float tile[TILE_SIZE][TILE_SIZE + 1];
+    int local_i = get_local_id(0);
+    int local_j = get_local_id(1);
+
+    if (i < k && j < m) {
+        tile[local_j][local_i] = a[j * k + i];
+    } else {
+        tile[local_j][local_i] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int target_j = (i - local_i) + local_j;
+    int target_i = (j - local_j) + local_i;
+    if (target_i < k && target_j < m) {
+        at[target_j * k + target_i] = tile[local_i][local_j];
+    }
+}
+
+__kernel void prefix_sum(__global unsigned int *as, __global unsigned int *bs, unsigned int i, unsigned int n)
+{
+    unsigned int gid = get_global_id(0);
+    if (gid >= n) {
+        return;
+    }
+    if (gid >= i) {
+        bs[gid] = as[gid - i] + as[gid];
+    } else {
+        bs[gid] = as[gid];
+    }
+}
+
+__kernel void radix_sort(__global unsigned int *as, __global unsigned int *bs, __global unsigned int *counters, unsigned int n, unsigned int shift, unsigned int bits_count)
+{
+    unsigned int gid = get_global_id(0);
+    if (gid >= n) {
+        return;
+    }
+
+    unsigned int value = (as[gid] >> shift) & ((1 << bits_count) - 1);
+
+    unsigned int wgid = get_group_id(0);
+
+    unsigned int start = wgid * WORK_GROUP_SIZE;
+    unsigned int end = gid;
+    unsigned int offset = 0;
+
+    for (unsigned int i = start; i < end; ++i) {
+        unsigned int prev_value = (as[i] >> shift) & ((1 << bits_count) - 1);
+        if (prev_value == value) {
+            ++offset;
+        }
+    }
+
+    unsigned int base_idx;
+    unsigned int counters_idx = wgid + value * WORK_GROUP_SIZE;
+    if (counters_idx > 0) {
+        base_idx = counters[counters_idx - 1];
+    } else {
+        base_idx = 0;
+    }
+
+    bs[base_idx + offset] = as[gid];
+}
diff --git a/src/main_radix.cpp b/src/main_radix.cpp
@@ -11,9 +11,13 @@
 #include <stdexcept>
 #include <vector>
 
+#define TILE_SIZE 16
+
 const int benchmarkingIters = 10;
 const int benchmarkingItersCPU = 1;
-const unsigned int n = 32 * 1024 * 1024;
+const unsigned int n = 16;
+
+const int bits_count = 2;
 
 template<typename T>
 void raiseFail(const T &a, const T &b, std::string message, std::string filename, int line) {
@@ -57,21 +61,67 @@ int main(int argc, char **argv) {
     std::cout << "Data generated for n=" << n << "!" << std::endl;
 
     const std::vector<unsigned int> cpu_reference = computeCPU(as);
-
-    // remove me
-    return 0;
-
+
+    unsigned int workGroupSize = 4;
+    unsigned int workGroupsCount = (n + workGroupSize - 1) / workGroupSize;
+    unsigned int globalWorkSize = workGroupsCount * workGroupSize;
+    unsigned int countersSize = workGroupsCount * (1 << bits_count);
+    unsigned int countersWorkSize = (countersSize + workGroupSize - 1) / workGroupSize * workGroupSize;
+
+    gpu::gpu_mem_32u as_gpu;
+    gpu::gpu_mem_32u bs_gpu;
+    gpu::gpu_mem_32u counters_gpu;
+    gpu::gpu_mem_32u counters_gpu_tmp;
+
+    as_gpu.resizeN(n);
+    bs_gpu.resizeN(n);
+    counters_gpu.resizeN(countersSize);
+    counters_gpu_tmp.resizeN(countersSize);
+
     {
+        ocl::Kernel fill_with_zeros(radix_kernel, radix_kernel_length, "fill_with_zeros");
+        fill_with_zeros.compile();
+        ocl::Kernel count(radix_kernel, radix_kernel_length, "count");
+        count.compile();
+        ocl::Kernel transpose(radix_kernel, radix_kernel_length, "matrix_transpose_local_good_banks");
+        transpose.compile();
+        ocl::Kernel prefix_sum(radix_kernel, radix_kernel_length, "prefix_sum");
+        prefix_sum.compile();
+        ocl::Kernel radix_sort(radix_kernel, radix_kernel_length, "radix_sort");
+        radix_sort.compile();
+
+	std::vector<unsigned int> counters(countersSize, 0);
+
         timer t;
         for (int iter = 0; iter < benchmarkingIters; ++iter) {
-            // Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных
+            as_gpu.writeN(as.data(), n);
+            t.restart();
+
+            for (unsigned int shift = 0; shift < 32; shift += bits_count) {
+                fill_with_zeros.exec(gpu::WorkSize(workGroupSize, countersWorkSize), counters_gpu, countersSize);
+
+                count.exec(gpu::WorkSize(workGroupSize, globalWorkSize), as_gpu, counters_gpu, n, shift, bits_count);
+
+                unsigned int x_size = ((1 << bits_count) + TILE_SIZE - 1) / TILE_SIZE * TILE_SIZE;
+                unsigned int y_size = (workGroupsCount + TILE_SIZE - 1) / TILE_SIZE * TILE_SIZE;
+                transpose.exec(gpu::WorkSize(TILE_SIZE, TILE_SIZE, x_size, y_size), counters_gpu, counters_gpu_tmp, 1 << bits_count, workGroupsCount);
+                std::swap(counters_gpu, counters_gpu_tmp);
+
+                for (unsigned int i = 1; i < countersSize; i *= 2) {
+                    prefix_sum.exec(gpu::WorkSize(workGroupSize, countersWorkSize), counters_gpu, counters_gpu_tmp, i, countersSize);
+                    std::swap(counters_gpu, counters_gpu_tmp);
+                }
 
-            // TODO
+                radix_sort.exec(gpu::WorkSize(workGroupSize, globalWorkSize), as_gpu, bs_gpu, counters_gpu, n, shift, bits_count);
+                std::swap(as_gpu, bs_gpu);
+            }
+            t.nextLap();
         }
         t.stop();
 
         std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
         std::cout << "GPU: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;
+        as_gpu.readN(as.data(), n);
     }
 
     // Проверяем корректность результатов