diff --git a/src/cl/matrix_multiplication.cl b/src/cl/matrix_multiplication.cl index b0cd7dc4..da9154c6 100644 --- a/src/cl/matrix_multiplication.cl +++ b/src/cl/matrix_multiplication.cl @@ -90,27 +90,22 @@ __kernel void matrix_multiplication_local_wpt( unsigned int local_i = get_local_id(0); unsigned int local_j = get_local_id(1); - float sum = 0.0f; - for (int tile_start = 0; tile_start < K; tile_start += TILE_SIZE) { - if (i < N && j < M && tile_start + local_i < K) - tile_a[local_j][local_i] = as[(tile_start + local_i) + j * K]; - else - tile_a[local_j][local_i] = 0.0f; - - if (i < N && j < M && tile_start + local_j < K) - tile_b[local_j][local_i] = bs[i + (tile_start + local_j) * N]; - else - tile_b[local_j][local_i] = 0.0f; + float sum[WORK_PER_THREAD] = { 0.0f }; + for (int tile_start = 0; tile_start < (K + TILE_SIZE - 1) / TILE_SIZE; tile_start++) { + for (int w = 0; w < WORK_PER_THREAD; w++) { + tile_a[local_j * WORK_PER_THREAD + w][local_i] = as[(tile_start * TILE_SIZE + local_i) + (j * WORK_PER_THREAD + w) * K]; + tile_b[local_j * WORK_PER_THREAD + w][local_i] = bs[(local_i) + (tile_start * TILE_SIZE + local_j * WORK_PER_THREAD + w) * N]; + } barrier(CLK_LOCAL_MEM_FENCE); for (int l = 0; l < TILE_SIZE; ++l) - sum += tile_a[local_j][l] * tile_b[l][local_i]; + for (int w = 0; w < WORK_PER_THREAD; w++) + sum[w] += tile_a[local_j * WORK_PER_THREAD + w][l] * tile_b[l][local_i]; barrier(CLK_LOCAL_MEM_FENCE); } - - if (i < N && j < M) - cs[j * N + i] = sum; + for (int w = 0; w < WORK_PER_THREAD; w++) + cs[i + (j * WORK_PER_THREAD + w) * N] = sum[w]; } #endif diff --git a/src/main_matrix_multiplication.cpp b/src/main_matrix_multiplication.cpp index 99d15e64..0321c2f6 100644 --- a/src/main_matrix_multiplication.cpp +++ b/src/main_matrix_multiplication.cpp @@ -69,7 +69,7 @@ KernelConfig makeLocalConfig(unsigned int tile_size) KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt) { std::string kernel_name = "matrix_multiplication_local_wpt"; - gpu::WorkSize work_size(tile_size, tile_size, M, N); + gpu::WorkSize work_size(tile_size, tile_size / wpt, M, N / wpt); std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt); std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]"; return KernelConfig{kernel_name, work_size, defines, prefix};