From fd5412735aa3b4032d32b01d824095865483c34d Mon Sep 17 00:00:00 2001
From: anton aksenov <anton.axenov.anton@gmail.com>
Date: Sun, 6 Oct 2024 23:31:53 +0300
Subject: [PATCH] task04 done

---
 src/cl/matrix_multiplication.cl    | 109 ++++++++++++++++++++++++++---
 src/cl/matrix_transpose.cl         |  78 ++++++++++++++++++---
 src/main_matrix_multiplication.cpp |  16 ++---
 src/main_matrix_transpose.cpp      |  10 +--
 4 files changed, 175 insertions(+), 38 deletions(-)

diff --git a/src/cl/matrix_multiplication.cl b/src/cl/matrix_multiplication.cl
index 14c668a8..b0cd7dc4 100644
--- a/src/cl/matrix_multiplication.cl
+++ b/src/cl/matrix_multiplication.cl
@@ -1,5 +1,7 @@
 #ifdef __CLION_IDE__
-    #include <libgpu/opencl/cl/clion_defines.cl>
+
+#include <libgpu/opencl/cl/clion_defines.cl>
+
 #endif
 
 
@@ -7,21 +9,108 @@
 
 // TILE_SIZE и WORK_PER_THREAD задаются через поле 'defines' в кернел конфиге
 
-__kernel void matrix_multiplication_naive()
-{
-    // TODO
+__kernel void matrix_multiplication_naive(
+        __global float *as,
+        __global float *bs,
+        __global float *cs,
+        const unsigned int M,
+        const unsigned int K,
+        const unsigned int N
+) {
+    unsigned int i = get_global_id(0);
+    unsigned int j = get_global_id(1);
+
+    if (i >= N || j >= M)
+        return;
+
+    float sum = 0.0f;
+    for (unsigned int k = 0; k < K; k++)
+        sum += as[j * K + k] * bs[k * N + i];
+    cs[j * N + i] = sum;
 }
 
 #ifdef TILE_SIZE
-__kernel void matrix_multiplication_local()
-{
-    // TODO
+__kernel void matrix_multiplication_local(
+        __global float *as,
+        __global float *bs,
+        __global float *cs,
+        const unsigned int M,
+        const unsigned int K,
+        const unsigned int N
+) {
+    unsigned int i = get_global_id(0);
+    unsigned int j = get_global_id(1);
+
+    __local float tile_a[TILE_SIZE][TILE_SIZE];
+    __local float tile_b[TILE_SIZE][TILE_SIZE];
+
+    unsigned int local_i = get_local_id(0);
+    unsigned int local_j = get_local_id(1);
+
+    float sum = 0.0f;
+    for (int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) {
+        if (i < N && j < M && tile_start + local_i < K)
+            tile_a[local_j][local_i] = as[(tile_start + local_i) + j * K];
+        else
+            tile_a[local_j][local_i] = 0.0f;
+
+        if (i < N && j < M && tile_start + local_j < K)
+            tile_b[local_j][local_i] = bs[i + (tile_start + local_j) * K];
+        else
+            tile_b[local_j][local_i] = 0.0f;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for (int l = 0; l < TILE_SIZE; ++l)
+            sum += tile_a[local_j][l] * tile_b[l][local_i];
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (i < N && j < M)
+        cs[j * N + i] = sum;
 }
 #endif
 
 #if defined(TILE_SIZE) && defined(WORK_PER_THREAD)
-__kernel void matrix_multiplication_local_wpt()
-{
-    // TODO
+__kernel void matrix_multiplication_local_wpt(
+        __global float *as,
+        __global float *bs,
+        __global float *cs,
+        const unsigned int M,
+        const unsigned int K,
+        const unsigned int N
+) {
+    unsigned int i = get_global_id(0);
+    unsigned int j = get_global_id(1);
+
+    __local float tile_a[TILE_SIZE][TILE_SIZE];
+    __local float tile_b[TILE_SIZE][TILE_SIZE];
+
+    unsigned int local_i = get_local_id(0);
+    unsigned int local_j = get_local_id(1);
+
+    float sum = 0.0f;
+    for (int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) {
+        if (i < N && j < M && tile_start + local_i < K)
+            tile_a[local_j][local_i] = as[(tile_start + local_i) + j * K];
+        else
+            tile_a[local_j][local_i] = 0.0f;
+
+        if (i < N && j < M && tile_start + local_j < K)
+            tile_b[local_j][local_i] = bs[i + (tile_start + local_j) * N];
+        else
+            tile_b[local_j][local_i] = 0.0f;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for (int l = 0; l < TILE_SIZE; ++l)
+            sum += tile_a[local_j][l] * tile_b[l][local_i];
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (i < N && j < M)
+        cs[j * N + i] = sum;
 }
 #endif
diff --git a/src/cl/matrix_transpose.cl b/src/cl/matrix_transpose.cl
index 92de15fc..b8e6ab4a 100644
--- a/src/cl/matrix_transpose.cl
+++ b/src/cl/matrix_transpose.cl
@@ -1,21 +1,79 @@
 #ifdef __CLION_IDE__
-    #include <libgpu/opencl/cl/clion_defines.cl>
+
+#include <libgpu/opencl/cl/clion_defines.cl>
+
 #endif
 
 
 #line 6
 
-__kernel void matrix_transpose_naive()
-{
-    // TODO
+__kernel void matrix_transpose_naive(
+        __global float *as,
+        __global float *as_t,
+        const unsigned int M,
+        const unsigned int K
+) {
+    unsigned int i = get_global_id(0);
+    unsigned int j = get_global_id(1);
+
+    if (i >= M || j >= K)
+        return;
+
+    as_t[j * M + i] = as[i * K + j];
 }
 
-__kernel void matrix_transpose_local_bad_banks()
-{
-    // TODO
+#define TILE_SIZE 16
+__kernel void matrix_transpose_local_bad_banks(
+        __global float *as,
+        __global float *as_t,
+        const unsigned int M,
+        const unsigned int K
+) {
+    unsigned int i = get_global_id(0);
+    unsigned int j = get_global_id(1);
+
+    unsigned int local_i = get_local_id(0);
+    unsigned int local_j = get_local_id(1);
+
+    __local float tile[TILE_SIZE][TILE_SIZE];
+
+    unsigned int group_i = get_group_id(0);
+    unsigned int group_j = get_group_id(1);
+
+    unsigned int i_new = group_i * TILE_SIZE + local_j;
+    unsigned int j_new = group_j * TILE_SIZE + local_i;
+
+    tile[local_j][local_i] = as[j * M + i];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    as_t[i_new * K + j_new] = tile[local_i][local_j];
 }
 
-__kernel void matrix_transpose_local_good_banks()
-{
-    // TODO
+#define TILE_SIZE 16
+__kernel void matrix_transpose_local_good_banks(
+        __global float *as,
+        __global float *as_t,
+        const unsigned int M,
+        const unsigned int K
+) {
+    unsigned int i = get_global_id(0);
+    unsigned int j = get_global_id(1);
+
+    unsigned int local_i = get_local_id(0);
+    unsigned int local_j = get_local_id(1);
+
+    __local float tile[TILE_SIZE * (TILE_SIZE + 1)];
+
+    unsigned int group_i = get_group_id(0);
+    unsigned int group_j = get_group_id(1);
+
+    unsigned int i_new = group_i * TILE_SIZE + local_j;
+    unsigned int j_new = group_j * TILE_SIZE + local_i;
+
+    tile[local_j * (TILE_SIZE + 1) + local_i] = as[j * M + i];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    as_t[i_new * K + j_new] = tile[local_i * (TILE_SIZE + 1) + local_j];
 }
diff --git a/src/main_matrix_multiplication.cpp b/src/main_matrix_multiplication.cpp
index 87fe2ce4..99d15e64 100644
--- a/src/main_matrix_multiplication.cpp
+++ b/src/main_matrix_multiplication.cpp
@@ -10,8 +10,8 @@
 #include <iostream>
 #include <stdexcept>
 
-const int benchmarkingIters = 10;
-const int benchmarkingItersCPU = 1;
+const int benchmarkingIters = 20;
+const int benchmarkingItersCPU = 5;
 const unsigned int M = 1024;
 const unsigned int K = 1024;
 const unsigned int N = 1024;
@@ -50,9 +50,8 @@ struct KernelConfig {
 
 KernelConfig makeNaiveConfig(unsigned int tile_size)
 {
-    throw std::runtime_error("not implemented");
     std::string kernel_name = "matrix_multiplication_naive";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
+    gpu::WorkSize work_size(tile_size, tile_size, M, N);
     std::string defines;
     std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
@@ -60,9 +59,8 @@ KernelConfig makeNaiveConfig(unsigned int tile_size)
 
 KernelConfig makeLocalConfig(unsigned int tile_size)
 {
-    throw std::runtime_error("not implemented");
     std::string kernel_name = "matrix_multiplication_local";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
+    gpu::WorkSize work_size(tile_size, tile_size, M, N);
     std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
     std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
@@ -70,9 +68,8 @@ KernelConfig makeLocalConfig(unsigned int tile_size)
 
 KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
 {
-    throw std::runtime_error("not implemented");
     std::string kernel_name = "matrix_multiplication_local_wpt";
-    gpu::WorkSize work_size(0, 0/*TODO*/);
+    gpu::WorkSize work_size(tile_size, tile_size, M, N);
     std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
     std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
@@ -143,9 +140,6 @@ int main(int argc, char **argv)
 
     const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());
 
-    // TODO uncomment
-    return 0;
-
     runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
     runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
     runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
diff --git a/src/main_matrix_transpose.cpp b/src/main_matrix_transpose.cpp
index 7aea58af..8e7fe11b 100644
--- a/src/main_matrix_transpose.cpp
+++ b/src/main_matrix_transpose.cpp
@@ -10,7 +10,7 @@
 #include <iostream>
 #include <stdexcept>
 
-const int benchmarkingIters = 100;
+const int benchmarkingIters = 1000;
 const unsigned int M = 4096;
 const unsigned int K = 4096;
 
@@ -33,9 +33,8 @@ void runTest(const std::string &kernel_name, const float *as)
         // поставьте каретку редактирования кода внутри скобок конструктора WorkSize -> Ctrl+P -> заметьте что есть 2, 4 и 6 параметров
         // - для 1D, 2D и 3D рабочего пространства соответственно
 
-        // TODO uncomment
-//        gpu::WorkSize work_size(0, 0, 0, 0 /*TODO*/);
-//        matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);
+        gpu::WorkSize work_size(16, 16, M, K);
+        matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);
 
         t.nextLap();
     }
@@ -74,9 +73,6 @@ int main(int argc, char **argv)
     }
     std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;
 
-    // TODO uncomment
-    return 0;
-
     runTest("matrix_transpose_naive", as.data());
     runTest("matrix_transpose_local_bad_banks", as.data());
     runTest("matrix_transpose_local_good_banks", as.data());