Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Task08 Алсу Верещагина ITMO #263

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 96 additions & 1 deletion src/cl/radix.cl
Original file line number Diff line number Diff line change
@@ -1 +1,96 @@
// TODO

#define WORK_GROUP_SIZE 4
#define TILE_SIZE 16

__kernel void fill_with_zeros(__global unsigned int *as, unsigned int n)
{
const unsigned int gid = get_global_id(0);
if (gid >= n) {
return;
}
as[gid] = 0;
}

__kernel void count(__global unsigned int *as, __global unsigned int *counters, unsigned int n, unsigned int shift, unsigned int bits_count) {
unsigned int gid = get_global_id(0);
if (gid >= n) {
return;
}
unsigned int value = (as[gid] >> shift) & ((1 << bits_count) - 1);
unsigned int wgid = get_group_id(0);
atomic_inc(&counters[wgid * (1 << bits_count) + value]);
}

__kernel void matrix_transpose_local_good_banks(
__global float *a,
__global float *at,
unsigned int m,
unsigned int k
) {
int i = get_global_id(0);
int j = get_global_id(1);

__local float tile[TILE_SIZE][TILE_SIZE + 1];
int local_i = get_local_id(0);
int local_j = get_local_id(1);

if (i < k && j < m) {
tile[local_j][local_i] = a[j * k + i];
} else {
tile[local_j][local_i] = 0;
}

barrier(CLK_LOCAL_MEM_FENCE);

int target_j = (i - local_i) + local_j;
int target_i = (j - local_j) + local_i;
if (target_i < k && target_j < m) {
at[target_j * k + target_i] = tile[local_i][local_j];
}
}

__kernel void prefix_sum(__global unsigned int *as, __global unsigned int *bs, unsigned int i, unsigned int n)
{
unsigned int gid = get_global_id(0);
if (gid >= n) {
return;
}
if (gid >= i) {
bs[gid] = as[gid - i] + as[gid];
} else {
bs[gid] = as[gid];
}
}

__kernel void radix_sort(__global unsigned int *as, __global unsigned int *bs, __global unsigned int *counters, unsigned int n, unsigned int shift, unsigned int bits_count)
{
unsigned int gid = get_global_id(0);
if (gid >= n) {
return;
}

unsigned int value = (as[gid] >> shift) & ((1 << bits_count) - 1);

unsigned int wgid = get_group_id(0);

unsigned int start = wgid * WORK_GROUP_SIZE;
unsigned int end = gid;
unsigned int offset = 0;

for (unsigned int i = start; i < end; ++i) {
unsigned int prev_value = (as[i] >> shift) & ((1 << bits_count) - 1);
if (prev_value == value) {
++offset;
}
}

unsigned int base_idx;
unsigned int counters_idx = wgid + value * WORK_GROUP_SIZE;
if (counters_idx > 0) {
base_idx = counters[counters_idx - 1];
} else {
base_idx = 0;
}

bs[base_idx + offset] = as[gid];
}
64 changes: 57 additions & 7 deletions src/main_radix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@
#include <stdexcept>
#include <vector>

#define TILE_SIZE 16

const int benchmarkingIters = 10;
const int benchmarkingItersCPU = 1;
const unsigned int n = 32 * 1024 * 1024;
const unsigned int n = 16;

const int bits_count = 2;

template<typename T>
void raiseFail(const T &a, const T &b, std::string message, std::string filename, int line) {
Expand Down Expand Up @@ -57,21 +61,67 @@ int main(int argc, char **argv) {
std::cout << "Data generated for n=" << n << "!" << std::endl;

const std::vector<unsigned int> cpu_reference = computeCPU(as);

// remove me
return 0;


unsigned int workGroupSize = 4;
unsigned int workGroupsCount = (n + workGroupSize - 1) / workGroupSize;
unsigned int globalWorkSize = workGroupsCount * workGroupSize;
unsigned int countersSize = workGroupsCount * (1 << bits_count);
unsigned int countersWorkSize = (countersSize + workGroupSize - 1) / workGroupSize * workGroupSize;

gpu::gpu_mem_32u as_gpu;
gpu::gpu_mem_32u bs_gpu;
gpu::gpu_mem_32u counters_gpu;
gpu::gpu_mem_32u counters_gpu_tmp;

as_gpu.resizeN(n);
bs_gpu.resizeN(n);
counters_gpu.resizeN(countersSize);
counters_gpu_tmp.resizeN(countersSize);

{
ocl::Kernel fill_with_zeros(radix_kernel, radix_kernel_length, "fill_with_zeros");
fill_with_zeros.compile();
ocl::Kernel count(radix_kernel, radix_kernel_length, "count");
count.compile();
ocl::Kernel transpose(radix_kernel, radix_kernel_length, "matrix_transpose_local_good_banks");
transpose.compile();
ocl::Kernel prefix_sum(radix_kernel, radix_kernel_length, "prefix_sum");
prefix_sum.compile();
ocl::Kernel radix_sort(radix_kernel, radix_kernel_length, "radix_sort");
radix_sort.compile();

std::vector<unsigned int> counters(countersSize, 0);

timer t;
for (int iter = 0; iter < benchmarkingIters; ++iter) {
// Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных
as_gpu.writeN(as.data(), n);
t.restart();

for (unsigned int shift = 0; shift < 32; shift += bits_count) {
fill_with_zeros.exec(gpu::WorkSize(workGroupSize, countersWorkSize), counters_gpu, countersSize);

count.exec(gpu::WorkSize(workGroupSize, globalWorkSize), as_gpu, counters_gpu, n, shift, bits_count);

unsigned int x_size = ((1 << bits_count) + TILE_SIZE - 1) / TILE_SIZE * TILE_SIZE;
unsigned int y_size = (workGroupsCount + TILE_SIZE - 1) / TILE_SIZE * TILE_SIZE;
transpose.exec(gpu::WorkSize(TILE_SIZE, TILE_SIZE, x_size, y_size), counters_gpu, counters_gpu_tmp, 1 << bits_count, workGroupsCount);
std::swap(counters_gpu, counters_gpu_tmp);

for (unsigned int i = 1; i < countersSize; i *= 2) {
prefix_sum.exec(gpu::WorkSize(workGroupSize, countersWorkSize), counters_gpu, counters_gpu_tmp, i, countersSize);
std::swap(counters_gpu, counters_gpu_tmp);
}

// TODO
radix_sort.exec(gpu::WorkSize(workGroupSize, globalWorkSize), as_gpu, bs_gpu, counters_gpu, n, shift, bits_count);
std::swap(as_gpu, bs_gpu);
}
t.nextLap();
}
t.stop();

std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
std::cout << "GPU: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;
as_gpu.readN(as.data(), n);
}

// Проверяем корректность результатов
Expand Down