[ blas/OpenCL ] SGEMM OpenCL kernels added #2648

Merged: 1 commit, Jul 4, 2024
24 changes: 18 additions & 6 deletions nntrainer/layers/layer_context.cpp
@@ -681,16 +681,28 @@ std::string RunLayerContext::getKernelName(LayerKernel layerKernel) {
switch (layerKernel) {
case LayerKernel::SGEMV:
return "sgemv_cl";
case LayerKernel::DOT:
return "dot_cl";
case LayerKernel::SGEMM:
return "sgemm_cl";
case LayerKernel::SGEMV_FP16:
return "sgemv_cl_fp16";
case LayerKernel::DOT:
return "dot_cl";
case LayerKernel::DOT_FP16:
return "dot_cl_fp16";
case LayerKernel::SGEMM_FP16:
return "sgemm_cl_fp16";
case LayerKernel::SGEMM_NOTRANS:
return "sgemm_cl_noTrans";
case LayerKernel::SGEMM_NOTRANS_FP16:
return "sgemm_cl_noTrans_fp16";
case LayerKernel::SGEMM_TRANSA:
return "sgemm_cl_transA";
case LayerKernel::SGEMM_TRANSA_FP16:
return "sgemm_cl_transA_fp16";
case LayerKernel::SGEMM_TRANSB:
return "sgemm_cl_transB";
case LayerKernel::SGEMM_TRANSB_FP16:
return "sgemm_cl_transB_fp16";
case LayerKernel::SGEMM_TRANSAB:
return "sgemm_cl_transAB";
case LayerKernel::SGEMM_TRANSAB_FP16:
return "sgemm_cl_transAB_fp16";
case LayerKernel::ADD:
return "addition_cl";
case LayerKernel::ADD_FP16:
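
For reference, a minimal sketch of how the new kernel names resolve, assuming getKernelName is accessible from the calling scope; the helper function and the include path below are illustrative, not part of the PR:

#include <string>
#include <layer_context.h> // assumed include path for nntrainer::RunLayerContext

// Picks the SGEMM kernel name for a given transpose combination,
// mirroring the mapping added to getKernelName above.
std::string resolveSgemmKernelName(nntrainer::RunLayerContext &context,
                                   bool transA, bool transB) {
  using LK = nntrainer::RunLayerContext::LayerKernel;
  if (!transA && !transB)
    return context.getKernelName(LK::SGEMM_NOTRANS);  // "sgemm_cl_noTrans"
  if (transA && !transB)
    return context.getKernelName(LK::SGEMM_TRANSA);   // "sgemm_cl_transA"
  if (!transA && transB)
    return context.getKernelName(LK::SGEMM_TRANSB);   // "sgemm_cl_transB"
  return context.getKernelName(LK::SGEMM_TRANSAB);    // "sgemm_cl_transAB"
}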
34 changes: 20 additions & 14 deletions nntrainer/layers/layer_context.h
@@ -830,20 +830,26 @@ class RunLayerContext {
* getKernelName function.
*/
enum LayerKernel {
SGEMV = 1 << 0, /**< placeholder for kernel name */
DOT = 1 << 1, /**< placeholder for kernel name */
SGEMM = 1 << 2, /**< placeholder for kernel name */
SGEMV_FP16 = 1 << 3, /**< placeholder for kernel name */
DOT_FP16 = 1 << 4, /**< placeholder for kernel name */
SGEMM_FP16 = 1 << 5, /**< placeholder for kernel name */
ADD = 1 << 6, /**< placeholder for kernel name */
ADD_FP16 = 1 << 7, /**< placeholder for kernel name */
SWIGLU = 1 << 8, /**< placeholder for kernel name */
SWIGLU_FP16 = 1 << 9, /**< placeholder for kernel name */
SSCAL = 1 << 10, /**< placeholder for kernel name */
SSCAL_FP16 = 1 << 11, /**< placeholder for kernel name */
COPY = 1 << 12, /**< placeholder for kernel name */
COPY_FP16 = 1 << 13 /**< placeholder for kernel name */
SGEMV = 1 << 0, /**< placeholder for kernel name */
SGEMV_FP16 = 1 << 1, /**< placeholder for kernel name */
DOT = 1 << 2, /**< placeholder for kernel name */
DOT_FP16 = 1 << 3, /**< placeholder for kernel name */
SGEMM_NOTRANS = 1 << 4, /**< placeholder for kernel name */
SGEMM_NOTRANS_FP16 = 1 << 5, /**< placeholder for kernel name */
SGEMM_TRANSA = 1 << 6, /**< placeholder for kernel name */
SGEMM_TRANSA_FP16 = 1 << 7, /**< placeholder for kernel name */
SGEMM_TRANSB = 1 << 8, /**< placeholder for kernel name */
SGEMM_TRANSB_FP16 = 1 << 9, /**< placeholder for kernel name */
SGEMM_TRANSAB = 1 << 10, /**< placeholder for kernel name */
SGEMM_TRANSAB_FP16 = 1 << 11, /**< placeholder for kernel name */
ADD = 1 << 12, /**< placeholder for kernel name */
ADD_FP16 = 1 << 13, /**< placeholder for kernel name */
SWIGLU = 1 << 14, /**< placeholder for kernel name */
SWIGLU_FP16 = 1 << 15, /**< placeholder for kernel name */
SSCAL = 1 << 16, /**< placeholder for kernel name */
SSCAL_FP16 = 1 << 17, /**< placeholder for kernel name */
COPY = 1 << 18, /**< placeholder for kernel name */
COPY_FP16 = 1 << 19, /**< placeholder for kernel name */
};

10 changes: 4 additions & 6 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
@@ -147,9 +147,8 @@ void dotCl(Tensor const &input, Tensor const &m, Tensor &result,
}
/// case others: use gemm
else {
// transA == false, transB == false
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
// todo: other condition implementations
sgemm_cl(transA, transB, data, mdata, rdata, M, N, K, lda, ldb, ldc,
context);
}
} else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
@@ -184,9 +183,8 @@ void dotCl(Tensor const &input, Tensor const &m, Tensor &result,
}
/// case others: use sgemm
else {
// transA == false, transB == false
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
// todo: other condition implementations
sgemm_cl(transA, transB, data, mdata, rdata, M, N, K, lda, ldb, ldc,
context);
}
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
111 changes: 95 additions & 16 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
@@ -35,8 +35,8 @@ std::string dot_cl_kernel_ =
}
})";

std::string sgemm_cl_kernel_ =
R"(__kernel void sgemm_cl(const __global float* A, const __global float* B,
std::string sgemm_cl_noTrans_kernel_ =
R"(__kernel void sgemm_cl_noTrans(const __global float* A, const __global float* B,
__global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {

unsigned int m = get_global_id(0);
@@ -51,6 +51,58 @@ std::string sgemm_cl_kernel_ =
C[m * ldc + n] = c;
})";

std::string sgemm_cl_transA_kernel_ =
R"(__kernel void sgemm_cl_transA(const __global float* A, const __global float* B,
__global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {

unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[k * lda + m];
b = B[k * ldb + n];
c += a * b;
}
C[m * ldc + n] = c;
})";

std::string sgemm_cl_transB_kernel_ =
R"(__kernel void sgemm_cl_transB(const __global float *A, const __global float *B,
__global float *C, unsigned int K,
unsigned int lda, unsigned int ldb,
unsigned int ldc) {

unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[m * lda + k];
b = B[n * ldb + k];
c += a * b;
}
C[m * ldc + n] = c;
})";

std::string sgemm_cl_transAB_kernel_ =
R"(__kernel void sgemm_cl_transAB(const __global float *A, const __global float *B,
__global float *C, unsigned int K,
unsigned int lda, unsigned int ldb,
unsigned int ldc) {

unsigned int m = get_global_id(0);
unsigned int n = get_global_id(1);
float c = 0.0f;
for (unsigned int k = 0; k < K; ++k) {
float a, b;
a = A[k * lda + m];
b = B[n * ldb + k];
c += a * b;
}
C[m * ldc + n] = c;
})";

std::string addition_cl_kernel_ =
R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
#pragma printf_support
@@ -71,7 +123,10 @@ std::string sscal_cl_kernel_ =
* @brief defining global kernel objects
*/
opencl::Kernel kernel_sgemv;
opencl::Kernel kernel_sgemm;
opencl::Kernel kernel_sgemm_transAB;
opencl::Kernel kernel_sgemm_transA;
opencl::Kernel kernel_sgemm_transB;
opencl::Kernel kernel_sgemm_noTrans;
opencl::Kernel kernel_dot;
opencl::Kernel kernel_addition;
opencl::Kernel kernel_sscal;
@@ -227,19 +282,43 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
return cl_ret;
}

void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context) {
void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const float *A,
const float *B, float *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context) {

opencl::Kernel *kernel_sgemm = nullptr;
RunLayerContext::LayerKernel layerKernel;
std::string sgemm_cl_kernel_;

if (TransA != CblasTrans && TransB != CblasTrans) {
kernel_sgemm = &kernel_sgemm_noTrans;
layerKernel = context.LayerKernel::SGEMM_NOTRANS;
sgemm_cl_kernel_ = sgemm_cl_noTrans_kernel_;
} else if (TransA == CblasTrans && TransB != CblasTrans) {
kernel_sgemm = &kernel_sgemm_transA;
layerKernel = context.LayerKernel::SGEMM_TRANSA;
sgemm_cl_kernel_ = sgemm_cl_transA_kernel_;
} else if (TransA != CblasTrans && TransB == CblasTrans) {
kernel_sgemm = &kernel_sgemm_transB;
layerKernel = context.LayerKernel::SGEMM_TRANSB;
sgemm_cl_kernel_ = sgemm_cl_transB_kernel_;
} else {
kernel_sgemm = &kernel_sgemm_transAB;
layerKernel = context.LayerKernel::SGEMM_TRANSAB;
sgemm_cl_kernel_ = sgemm_cl_transAB_kernel_;
}

bool result = false;

do {
result = context.clCreateKernel(sgemm_cl_kernel_,
context.LayerKernel::SGEMM, kernel_sgemm);
result =
context.clCreateKernel(sgemm_cl_kernel_, layerKernel, *kernel_sgemm);
if (!result) {
break;
}

// buffer sizes are the same for the transposed variants
size_t m_k_size = M * K * sizeof(float);
size_t k_n_size = K * N * sizeof(float);
size_t m_n_size = M * N * sizeof(float);
@@ -265,37 +344,37 @@ void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
break;
}

result = kernel_sgemm.SetKernelArguments(0, &inputA, sizeof(cl_mem));
result = kernel_sgemm->SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(1, &inputB, sizeof(cl_mem));
result = kernel_sgemm->SetKernelArguments(1, &inputB, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(2, &inOutC, sizeof(cl_mem));
result = kernel_sgemm->SetKernelArguments(2, &inOutC, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(3, &K, sizeof(int));
result = kernel_sgemm->SetKernelArguments(3, &K, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(4, &lda, sizeof(int));
result = kernel_sgemm->SetKernelArguments(4, &lda, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(5, &ldb, sizeof(int));
result = kernel_sgemm->SetKernelArguments(5, &ldb, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(6, &ldc, sizeof(int));
result = kernel_sgemm->SetKernelArguments(6, &ldc, sizeof(int));
if (!result) {
break;
}
@@ -304,7 +383,7 @@ void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
const int work_group_size[3] = {32, 32, 1}; // test-value

result = context.command_queue_inst_.DispatchCommand(
kernel_sgemm, work_groups_count, work_group_size);
*kernel_sgemm, work_groups_count, work_group_size);
if (!result) {
break;
}
28 changes: 20 additions & 8 deletions nntrainer/tensor/cl_operations/blas_kernels.h
@@ -25,7 +25,10 @@ namespace nntrainer {
* @brief declaring global kernel objects
*/
extern opencl::Kernel kernel_sgemv;
extern opencl::Kernel kernel_sgemm;
extern opencl::Kernel kernel_sgemm_noTrans;
extern opencl::Kernel kernel_sgemm_transAB;
extern opencl::Kernel kernel_sgemm_transA;
extern opencl::Kernel kernel_sgemm_transB;
extern opencl::Kernel kernel_dot;
extern opencl::Kernel kernel_addition;
extern opencl::Kernel kernel_sscal;
@@ -58,6 +61,8 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
/**
* @brief sgemm computation : Y = op(A)*op(B) + C,
* where op(X) is one of X or X**T
* @param[in] transA CBLAS_TRANSPOSE
* @param[in] transB CBLAS_TRANSPOSE
* @param[in] A float * for Matrix A
* @param[in] B float * for Matrix B
* @param[in] C float * for Matrix C
@@ -69,9 +74,10 @@ float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
* @param[in] ldc number of C's columns
* @param[in] context RunLayerContext reference
*/
void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context);
void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const float *A,
const float *B, float *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context);
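
A hedged usage sketch of the updated entry point, assuming CblasTrans and CblasNoTrans come from the CBLAS_TRANSPOSE definition nntrainer already uses, and that rc is a valid RunLayerContext; here C = A^T * B, so A is stored K x M with lda = M, B is stored K x N with ldb = N, and C is M x N with ldc = N (include path and function name are illustrative):

#include <vector>
#include <blas_kernels.h> // assumed include path

// Sketch only: buffer contents and RunLayerContext setup are omitted.
void run_transA_gemm(nntrainer::RunLayerContext &rc, unsigned int M,
                     unsigned int N, unsigned int K) {
  std::vector<float> A(K * M), B(K * N), C(M * N); // fill A and B before the call
  nntrainer::sgemm_cl(CblasTrans, CblasNoTrans, A.data(), B.data(), C.data(),
                      M, N, K, /*lda=*/M, /*ldb=*/N, /*ldc=*/N, rc);
}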

/**
* @brief addition : sum of all input vectors
Expand All @@ -98,7 +104,10 @@ void sscal_cl(float *X, const unsigned int N, const float alpha,
* @brief declaring global fp16 kernel objects
*/
extern opencl::Kernel kernel_sgemv_fp16;
extern opencl::Kernel kernel_sgemm_fp16;
extern opencl::Kernel kernel_sgemm_noTrans_fp16;
extern opencl::Kernel kernel_sgemm_transAB_fp16;
extern opencl::Kernel kernel_sgemm_transA_fp16;
extern opencl::Kernel kernel_sgemm_transB_fp16;
extern opencl::Kernel kernel_dot_fp16;
extern opencl::Kernel kernel_addition_fp16;
extern opencl::Kernel kernel_sscal_fp16;
@@ -131,6 +140,8 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
/**
* @brief fp16 sgemm computation : Y = op(A)*op(B) + C,
* where op(X) is one of X or X**T
* @param[in] transA CBLAS_TRANSPOSE
* @param[in] transB CBLAS_TRANSPOSE
* @param[in] A fp16 * for Matrix A
* @param[in] B fp16 * for Matrix B
* @param[in] C fp16 * for Matrix C
Expand All @@ -142,9 +153,10 @@ __fp16 dot_cl(const __fp16 *vecAdata, const __fp16 *vecXdata, unsigned int dim1,
* @param[in] ldc number of C's columns
* @param[in] context RunLayerContext reference
*/
void sgemm_cl(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context);
void sgemm_cl(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, const __fp16 *A,
const __fp16 *B, __fp16 *C, unsigned int M, unsigned int N,
unsigned int K, unsigned int lda, unsigned int ldb,
unsigned int ldc, RunLayerContext &context);

/**
* @brief fp16 addition : sum of all input vectors