From fd5412735aa3b4032d32b01d824095865483c34d Mon Sep 17 00:00:00 2001 From: anton aksenov Date: Sun, 6 Oct 2024 23:31:53 +0300 Subject: [PATCH] task04 done --- src/cl/matrix_multiplication.cl | 109 ++++++++++++++++++++++++++--- src/cl/matrix_transpose.cl | 78 ++++++++++++++++++--- src/main_matrix_multiplication.cpp | 16 ++--- src/main_matrix_transpose.cpp | 10 +-- 4 files changed, 175 insertions(+), 38 deletions(-) diff --git a/src/cl/matrix_multiplication.cl b/src/cl/matrix_multiplication.cl index 14c668a8..b0cd7dc4 100644 --- a/src/cl/matrix_multiplication.cl +++ b/src/cl/matrix_multiplication.cl @@ -1,5 +1,7 @@ #ifdef __CLION_IDE__ - #include + +#include + #endif @@ -7,21 +9,108 @@ // TILE_SIZE и WORK_PER_THREAD задаются через поле 'defines' в кернел конфиге -__kernel void matrix_multiplication_naive() -{ - // TODO +__kernel void matrix_multiplication_naive( + __global float *as, + __global float *bs, + __global float *cs, + const unsigned int M, + const unsigned int K, + const unsigned int N +) { + unsigned int i = get_global_id(0); + unsigned int j = get_global_id(1); + + if (i >= N || j >= M) + return; + + float sum = 0.0f; + for (unsigned int k = 0; k < K; k++) + sum += as[j * K + k] * bs[k * N + i]; + cs[j * N + i] = sum; } #ifdef TILE_SIZE -__kernel void matrix_multiplication_local() -{ - // TODO +__kernel void matrix_multiplication_local( + __global float *as, + __global float *bs, + __global float *cs, + const unsigned int M, + const unsigned int K, + const unsigned int N +) { + unsigned int i = get_global_id(0); + unsigned int j = get_global_id(1); + + __local float tile_a[TILE_SIZE][TILE_SIZE]; + __local float tile_b[TILE_SIZE][TILE_SIZE]; + + unsigned int local_i = get_local_id(0); + unsigned int local_j = get_local_id(1); + + float sum = 0.0f; + for (int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) { + if (i < N && j < M && tile_start + local_i < K) + tile_a[local_j][local_i] = as[(tile_start + local_i) + j * K]; + else + tile_a[local_j][local_i] = 0.0f; + + if (i < N && j < M && tile_start + local_j < K) + tile_b[local_j][local_i] = bs[i + (tile_start + local_j) * K]; + else + tile_b[local_j][local_i] = 0.0f; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int l = 0; l < TILE_SIZE; ++l) + sum += tile_a[local_j][l] * tile_b[l][local_i]; + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (i < N && j < M) + cs[j * N + i] = sum; } #endif #if defined(TILE_SIZE) && defined(WORK_PER_THREAD) -__kernel void matrix_multiplication_local_wpt() -{ - // TODO +__kernel void matrix_multiplication_local_wpt( + __global float *as, + __global float *bs, + __global float *cs, + const unsigned int M, + const unsigned int K, + const unsigned int N +) { + unsigned int i = get_global_id(0); + unsigned int j = get_global_id(1); + + __local float tile_a[TILE_SIZE][TILE_SIZE]; + __local float tile_b[TILE_SIZE][TILE_SIZE]; + + unsigned int local_i = get_local_id(0); + unsigned int local_j = get_local_id(1); + + float sum = 0.0f; + for (int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) { + if (i < N && j < M && tile_start + local_i < K) + tile_a[local_j][local_i] = as[(tile_start + local_i) + j * K]; + else + tile_a[local_j][local_i] = 0.0f; + + if (i < N && j < M && tile_start + local_j < K) + tile_b[local_j][local_i] = bs[i + (tile_start + local_j) * N]; + else + tile_b[local_j][local_i] = 0.0f; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int l = 0; l < TILE_SIZE; ++l) + sum += tile_a[local_j][l] * tile_b[l][local_i]; + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (i < N && j < M) + cs[j * N + i] = sum; } #endif diff --git a/src/cl/matrix_transpose.cl b/src/cl/matrix_transpose.cl index 92de15fc..b8e6ab4a 100644 --- a/src/cl/matrix_transpose.cl +++ b/src/cl/matrix_transpose.cl @@ -1,21 +1,79 @@ #ifdef __CLION_IDE__ - #include + +#include + #endif #line 6 -__kernel void matrix_transpose_naive() -{ - // TODO +__kernel void matrix_transpose_naive( + __global float *as, + __global float *as_t, + const unsigned int M, + const unsigned int K +) { + unsigned int i = get_global_id(0); + unsigned int j = get_global_id(1); + + if (i >= M || j >= K) + return; + + as_t[j * M + i] = as[i * K + j]; } -__kernel void matrix_transpose_local_bad_banks() -{ - // TODO +#define TILE_SIZE 16 +__kernel void matrix_transpose_local_bad_banks( + __global float *as, + __global float *as_t, + const unsigned int M, + const unsigned int K +) { + unsigned int i = get_global_id(0); + unsigned int j = get_global_id(1); + + unsigned int local_i = get_local_id(0); + unsigned int local_j = get_local_id(1); + + __local float tile[TILE_SIZE][TILE_SIZE]; + + unsigned int group_i = get_group_id(0); + unsigned int group_j = get_group_id(1); + + unsigned int i_new = group_i * TILE_SIZE + local_j; + unsigned int j_new = group_j * TILE_SIZE + local_i; + + tile[local_j][local_i] = as[j * M + i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + as_t[i_new * K + j_new] = tile[local_i][local_j]; } -__kernel void matrix_transpose_local_good_banks() -{ - // TODO +#define TILE_SIZE 16 +__kernel void matrix_transpose_local_good_banks( + __global float *as, + __global float *as_t, + const unsigned int M, + const unsigned int K +) { + unsigned int i = get_global_id(0); + unsigned int j = get_global_id(1); + + unsigned int local_i = get_local_id(0); + unsigned int local_j = get_local_id(1); + + __local float tile[TILE_SIZE * (TILE_SIZE + 1)]; + + unsigned int group_i = get_group_id(0); + unsigned int group_j = get_group_id(1); + + unsigned int i_new = group_i * TILE_SIZE + local_j; + unsigned int j_new = group_j * TILE_SIZE + local_i; + + tile[local_j * (TILE_SIZE + 1) + local_i] = as[j * M + i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + as_t[i_new * K + j_new] = tile[local_i * (TILE_SIZE + 1) + local_j]; } diff --git a/src/main_matrix_multiplication.cpp b/src/main_matrix_multiplication.cpp index 87fe2ce4..99d15e64 100644 --- a/src/main_matrix_multiplication.cpp +++ b/src/main_matrix_multiplication.cpp @@ -10,8 +10,8 @@ #include #include -const int benchmarkingIters = 10; -const int benchmarkingItersCPU = 1; +const int benchmarkingIters = 20; +const int benchmarkingItersCPU = 5; const unsigned int M = 1024; const unsigned int K = 1024; const unsigned int N = 1024; @@ -50,9 +50,8 @@ struct KernelConfig { KernelConfig makeNaiveConfig(unsigned int tile_size) { - throw std::runtime_error("not implemented"); std::string kernel_name = "matrix_multiplication_naive"; - gpu::WorkSize work_size(0, 0/*TODO*/); + gpu::WorkSize work_size(tile_size, tile_size, M, N); std::string defines; std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]"; return KernelConfig{kernel_name, work_size, defines, prefix}; @@ -60,9 +59,8 @@ KernelConfig makeNaiveConfig(unsigned int tile_size) KernelConfig makeLocalConfig(unsigned int tile_size) { - throw std::runtime_error("not implemented"); std::string kernel_name = "matrix_multiplication_local"; - gpu::WorkSize work_size(0, 0/*TODO*/); + gpu::WorkSize work_size(tile_size, tile_size, M, N); std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size); std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]"; return KernelConfig{kernel_name, work_size, defines, prefix}; @@ -70,9 +68,8 @@ KernelConfig makeLocalConfig(unsigned int tile_size) KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt) { - throw std::runtime_error("not implemented"); std::string kernel_name = "matrix_multiplication_local_wpt"; - gpu::WorkSize work_size(0, 0/*TODO*/); + gpu::WorkSize work_size(tile_size, tile_size, M, N); std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt); std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]"; return KernelConfig{kernel_name, work_size, defines, prefix}; @@ -143,9 +140,6 @@ int main(int argc, char **argv) const std::vector cs_cpu_reference = computeCPU(as.data(), bs.data()); - // TODO uncomment - return 0; - runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data()); runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data()); runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data()); diff --git a/src/main_matrix_transpose.cpp b/src/main_matrix_transpose.cpp index 7aea58af..8e7fe11b 100644 --- a/src/main_matrix_transpose.cpp +++ b/src/main_matrix_transpose.cpp @@ -10,7 +10,7 @@ #include #include -const int benchmarkingIters = 100; +const int benchmarkingIters = 1000; const unsigned int M = 4096; const unsigned int K = 4096; @@ -33,9 +33,8 @@ void runTest(const std::string &kernel_name, const float *as) // поставьте каретку редактирования кода внутри скобок конструктора WorkSize -> Ctrl+P -> заметьте что есть 2, 4 и 6 параметров // - для 1D, 2D и 3D рабочего пространства соответственно - // TODO uncomment -// gpu::WorkSize work_size(0, 0, 0, 0 /*TODO*/); -// matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K); + gpu::WorkSize work_size(16, 16, M, K); + matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K); t.nextLap(); } @@ -74,9 +73,6 @@ int main(int argc, char **argv) } std::cout << "Data generated for M=" << M << ", K=" << K << std::endl; - // TODO uncomment - return 0; - runTest("matrix_transpose_naive", as.data()); runTest("matrix_transpose_local_bad_banks", as.data()); runTest("matrix_transpose_local_good_banks", as.data());