Skip to content

Commit

Permalink
task04 done
Browse files Browse the repository at this point in the history
  • Loading branch information
AntonAksenov committed Oct 6, 2024
1 parent ebe3665 commit fd54127
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 38 deletions.
109 changes: 99 additions & 10 deletions src/cl/matrix_multiplication.cl
Original file line number Diff line number Diff line change
@@ -1,27 +1,116 @@
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>

#include <libgpu/opencl/cl/clion_defines.cl>

#endif


#line 6

// TILE_SIZE и WORK_PER_THREAD задаются через поле 'defines' в кернел конфиге

__kernel void matrix_multiplication_naive()
{
// TODO
__kernel void matrix_multiplication_naive(
__global float *as,
__global float *bs,
__global float *cs,
const unsigned int M,
const unsigned int K,
const unsigned int N
) {
unsigned int i = get_global_id(0);
unsigned int j = get_global_id(1);

if (i >= N || j >= M)
return;

float sum = 0.0f;
for (unsigned int k = 0; k < K; k++)
sum += as[j * K + k] * bs[k * N + i];
cs[j * N + i] = sum;
}

#ifdef TILE_SIZE
__kernel void matrix_multiplication_local()
{
// TODO
__kernel void matrix_multiplication_local(
__global float *as,
__global float *bs,
__global float *cs,
const unsigned int M,
const unsigned int K,
const unsigned int N
) {
unsigned int i = get_global_id(0);
unsigned int j = get_global_id(1);

__local float tile_a[TILE_SIZE][TILE_SIZE];
__local float tile_b[TILE_SIZE][TILE_SIZE];

unsigned int local_i = get_local_id(0);
unsigned int local_j = get_local_id(1);

float sum = 0.0f;
for (int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) {
if (i < N && j < M && tile_start + local_i < K)
tile_a[local_j][local_i] = as[(tile_start + local_i) + j * K];
else
tile_a[local_j][local_i] = 0.0f;

if (i < N && j < M && tile_start + local_j < K)
tile_b[local_j][local_i] = bs[i + (tile_start + local_j) * K];
else
tile_b[local_j][local_i] = 0.0f;

barrier(CLK_LOCAL_MEM_FENCE);

for (int l = 0; l < TILE_SIZE; ++l)
sum += tile_a[local_j][l] * tile_b[l][local_i];

barrier(CLK_LOCAL_MEM_FENCE);
}

if (i < N && j < M)
cs[j * N + i] = sum;
}
#endif

#if defined(TILE_SIZE) && defined(WORK_PER_THREAD)
__kernel void matrix_multiplication_local_wpt()
{
// TODO
__kernel void matrix_multiplication_local_wpt(
__global float *as,
__global float *bs,
__global float *cs,
const unsigned int M,
const unsigned int K,
const unsigned int N
) {
unsigned int i = get_global_id(0);
unsigned int j = get_global_id(1);

__local float tile_a[TILE_SIZE][TILE_SIZE];
__local float tile_b[TILE_SIZE][TILE_SIZE];

unsigned int local_i = get_local_id(0);
unsigned int local_j = get_local_id(1);

float sum = 0.0f;
for (int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) {
if (i < N && j < M && tile_start + local_i < K)
tile_a[local_j][local_i] = as[(tile_start + local_i) + j * K];
else
tile_a[local_j][local_i] = 0.0f;

if (i < N && j < M && tile_start + local_j < K)
tile_b[local_j][local_i] = bs[i + (tile_start + local_j) * N];
else
tile_b[local_j][local_i] = 0.0f;

barrier(CLK_LOCAL_MEM_FENCE);

for (int l = 0; l < TILE_SIZE; ++l)
sum += tile_a[local_j][l] * tile_b[l][local_i];

barrier(CLK_LOCAL_MEM_FENCE);
}

if (i < N && j < M)
cs[j * N + i] = sum;
}
#endif
78 changes: 68 additions & 10 deletions src/cl/matrix_transpose.cl
Original file line number Diff line number Diff line change
@@ -1,21 +1,79 @@
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>

#include <libgpu/opencl/cl/clion_defines.cl>

#endif


#line 6

__kernel void matrix_transpose_naive()
{
// TODO
__kernel void matrix_transpose_naive(
__global float *as,
__global float *as_t,
const unsigned int M,
const unsigned int K
) {
unsigned int i = get_global_id(0);
unsigned int j = get_global_id(1);

if (i >= M || j >= K)
return;

as_t[j * M + i] = as[i * K + j];
}

__kernel void matrix_transpose_local_bad_banks()
{
// TODO
#define TILE_SIZE 16
__kernel void matrix_transpose_local_bad_banks(
__global float *as,
__global float *as_t,
const unsigned int M,
const unsigned int K
) {
unsigned int i = get_global_id(0);
unsigned int j = get_global_id(1);

unsigned int local_i = get_local_id(0);
unsigned int local_j = get_local_id(1);

__local float tile[TILE_SIZE][TILE_SIZE];

unsigned int group_i = get_group_id(0);
unsigned int group_j = get_group_id(1);

unsigned int i_new = group_i * TILE_SIZE + local_j;
unsigned int j_new = group_j * TILE_SIZE + local_i;

tile[local_j][local_i] = as[j * M + i];

barrier(CLK_LOCAL_MEM_FENCE);

as_t[i_new * K + j_new] = tile[local_i][local_j];
}

__kernel void matrix_transpose_local_good_banks()
{
// TODO
#define TILE_SIZE 16
__kernel void matrix_transpose_local_good_banks(
__global float *as,
__global float *as_t,
const unsigned int M,
const unsigned int K
) {
unsigned int i = get_global_id(0);
unsigned int j = get_global_id(1);

unsigned int local_i = get_local_id(0);
unsigned int local_j = get_local_id(1);

__local float tile[TILE_SIZE * (TILE_SIZE + 1)];

unsigned int group_i = get_group_id(0);
unsigned int group_j = get_group_id(1);

unsigned int i_new = group_i * TILE_SIZE + local_j;
unsigned int j_new = group_j * TILE_SIZE + local_i;

tile[local_j * (TILE_SIZE + 1) + local_i] = as[j * M + i];

barrier(CLK_LOCAL_MEM_FENCE);

as_t[i_new * K + j_new] = tile[local_i * (TILE_SIZE + 1) + local_j];
}
16 changes: 5 additions & 11 deletions src/main_matrix_multiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
#include <iostream>
#include <stdexcept>

const int benchmarkingIters = 10;
const int benchmarkingItersCPU = 1;
const int benchmarkingIters = 20;
const int benchmarkingItersCPU = 5;
const unsigned int M = 1024;
const unsigned int K = 1024;
const unsigned int N = 1024;
Expand Down Expand Up @@ -50,29 +50,26 @@ struct KernelConfig {

KernelConfig makeNaiveConfig(unsigned int tile_size)
{
throw std::runtime_error("not implemented");
std::string kernel_name = "matrix_multiplication_naive";
gpu::WorkSize work_size(0, 0/*TODO*/);
gpu::WorkSize work_size(tile_size, tile_size, M, N);
std::string defines;
std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
}

KernelConfig makeLocalConfig(unsigned int tile_size)
{
throw std::runtime_error("not implemented");
std::string kernel_name = "matrix_multiplication_local";
gpu::WorkSize work_size(0, 0/*TODO*/);
gpu::WorkSize work_size(tile_size, tile_size, M, N);
std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
}

KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
{
throw std::runtime_error("not implemented");
std::string kernel_name = "matrix_multiplication_local_wpt";
gpu::WorkSize work_size(0, 0/*TODO*/);
gpu::WorkSize work_size(tile_size, tile_size, M, N);
std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
Expand Down Expand Up @@ -143,9 +140,6 @@ int main(int argc, char **argv)

const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());

// TODO uncomment
return 0;

runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
Expand Down
10 changes: 3 additions & 7 deletions src/main_matrix_transpose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <iostream>
#include <stdexcept>

const int benchmarkingIters = 100;
const int benchmarkingIters = 1000;
const unsigned int M = 4096;
const unsigned int K = 4096;

Expand All @@ -33,9 +33,8 @@ void runTest(const std::string &kernel_name, const float *as)
// поставьте каретку редактирования кода внутри скобок конструктора WorkSize -> Ctrl+P -> заметьте что есть 2, 4 и 6 параметров
// - для 1D, 2D и 3D рабочего пространства соответственно

// TODO uncomment
// gpu::WorkSize work_size(0, 0, 0, 0 /*TODO*/);
// matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);
gpu::WorkSize work_size(16, 16, M, K);
matrix_transpose_kernel.exec(work_size, as_gpu, as_t_gpu, M, K);

t.nextLap();
}
Expand Down Expand Up @@ -74,9 +73,6 @@ int main(int argc, char **argv)
}
std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;

// TODO uncomment
return 0;

runTest("matrix_transpose_naive", as.data());
runTest("matrix_transpose_local_bad_banks", as.data());
runTest("matrix_transpose_local_good_banks", as.data());
Expand Down

0 comments on commit fd54127

Please sign in to comment.